author     Stephen Hines <srhines@google.com>        2014-05-29 02:49:00 -0700
committer  Stephen Hines <srhines@google.com>        2014-05-29 02:49:00 -0700
commit     dce4a407a24b04eebc6a376f8e62b41aaa7b071f
tree       dcebc53f2b182f145a2e659393bf9a0472cedf23 /lib/Target/X86
parent     220b921aed042f9e520c26cffd8282a94c66c3d5
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'lib/Target/X86')
78 files changed, 3701 insertions, 2985 deletions
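Two of the changes below are easier to follow with the rationale up front. First, X86AddressSanitizer::InstrumentMOV previously hard-coded, per MOV opcode, which operand index held the memory reference and inferred read-vs-write from that index; the rewrite asks the instruction description instead and instruments every memory operand it finds. A minimal sketch of that idiom, as it would sit inside InstrumentMOV where Inst and MII are already in scope (the example opcodes are taken from the patch, the surrounding setup is assumed):

    // Classify the access through MCInstrDesc rather than hand-maintained
    // operand-index tables: "mr"/"mi" forms store, "rm" forms load.
    const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
    const bool IsWrite = Desc.mayStore(); // e.g. X86::MOV32mr -> true, X86::MOV32rm -> false

Second, the x86-64 instrumentation stops stepping %rsp over the 128-byte red zone with SUB/ADD, which clobber EFLAGS, and emits LEA instead, which computes the same address without touching the flags: in assembly terms, subq $128, %rsp becomes leaq -128(%rsp), %rsp.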
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 73031de..0d0a9ca 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,7 +12,6 @@ x86_codegen_TBLGEN_TABLES := \
 
 x86_codegen_SRC_FILES := \
   X86AsmPrinter.cpp \
-  X86COFFMachineModuleInfo.cpp \
   X86CodeEmitter.cpp \
   X86FastISel.cpp \
   X86FixupLEAs.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index db29228..f3e6b3f 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -11,21 +11,25 @@
 #include "X86AsmInstrumentation.h"
 #include "X86Operand.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 
 namespace llvm {
 namespace {
 
-static cl::opt<bool> ClAsanInstrumentInlineAssembly(
-    "asan-instrument-inline-assembly", cl::desc("instrument inline assembly"),
-    cl::Hidden, cl::init(false));
+static cl::opt<bool> ClAsanInstrumentAssembly(
+    "asan-instrument-assembly",
+    cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+    cl::init(false));
 
 bool IsStackReg(unsigned Reg) {
   return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
@@ -38,14 +42,14 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) {
 
 class X86AddressSanitizer : public X86AsmInstrumentation {
 public:
-  X86AddressSanitizer(MCSubtargetInfo &sti) : STI(sti) {}
+  X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {}
   virtual ~X86AddressSanitizer() {}
 
   // X86AsmInstrumentation implementation:
   virtual void InstrumentInstruction(
       const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-      MCContext &Ctx, MCStreamer &Out) override {
-    InstrumentMOV(Inst, Operands, Ctx, Out);
+      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override {
+    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
   }
 
   // Should be implemented differently in x86_32 and x86_64 subclasses.
@@ -57,13 +61,13 @@ public:
                                 bool IsWrite, MCContext &Ctx, MCStreamer &Out);
   void InstrumentMOV(const MCInst &Inst,
                      SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-                     MCContext &Ctx, MCStreamer &Out);
+                     MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
   void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
     Out.EmitInstruction(Inst, STI);
   }
 
 protected:
-  MCSubtargetInfo &STI;
+  const MCSubtargetInfo &STI;
 };
 
 void X86AddressSanitizer::InstrumentMemOperand(
@@ -83,68 +87,53 @@ void X86AddressSanitizer::InstrumentMemOperand(
 
 void X86AddressSanitizer::InstrumentMOV(
     const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-    MCContext &Ctx, MCStreamer &Out) {
+    MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {
   // Access size in bytes.
   unsigned AccessSize = 0;
-  unsigned long OpIx = Operands.size();
+
   switch (Inst.getOpcode()) {
   case X86::MOV8mi:
   case X86::MOV8mr:
-    AccessSize = 1;
-    OpIx = 2;
-    break;
   case X86::MOV8rm:
     AccessSize = 1;
-    OpIx = 1;
     break;
   case X86::MOV16mi:
   case X86::MOV16mr:
-    AccessSize = 2;
-    OpIx = 2;
-    break;
   case X86::MOV16rm:
     AccessSize = 2;
-    OpIx = 1;
     break;
   case X86::MOV32mi:
   case X86::MOV32mr:
-    AccessSize = 4;
-    OpIx = 2;
-    break;
   case X86::MOV32rm:
     AccessSize = 4;
-    OpIx = 1;
     break;
   case X86::MOV64mi32:
   case X86::MOV64mr:
-    AccessSize = 8;
-    OpIx = 2;
-    break;
   case X86::MOV64rm:
     AccessSize = 8;
-    OpIx = 1;
     break;
   case X86::MOVAPDmr:
   case X86::MOVAPSmr:
-    AccessSize = 16;
-    OpIx = 2;
-    break;
   case X86::MOVAPDrm:
   case X86::MOVAPSrm:
     AccessSize = 16;
-    OpIx = 1;
     break;
-  }
-  if (OpIx >= Operands.size())
+  default:
     return;
+  }
 
-  const bool IsWrite = (OpIx != 1);
-  InstrumentMemOperand(Operands[OpIx], AccessSize, IsWrite, Ctx, Out);
+  const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+  for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+    MCParsedAsmOperand *Op = Operands[Ix];
+    if (Op && Op->isMem())
+      InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
+  }
 }
 
 class X86AddressSanitizer32 : public X86AddressSanitizer {
 public:
-  X86AddressSanitizer32(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+  X86AddressSanitizer32(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
   virtual ~X86AddressSanitizer32() {}
 
   virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -172,14 +161,14 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl(
         MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
   }
-  EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP)
-                           .addReg(X86::ESP).addImm(4));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
   EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
 }
 
 class X86AddressSanitizer64 : public X86AddressSanitizer {
 public:
-  X86AddressSanitizer64(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+  X86AddressSanitizer64(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
   virtual ~X86AddressSanitizer64() {}
 
   virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -187,13 +176,26 @@ public:
                                         MCStreamer &Out) override;
 };
 
-void X86AddressSanitizer64::InstrumentMemOperandImpl(
-    X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
+void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op,
+                                                     unsigned AccessSize,
+                                                     bool IsWrite,
+                                                     MCContext &Ctx,
+                                                     MCStreamer &Out) {
   // FIXME: emit .cfi directives for correct stack unwinding.
-  // Set %rsp below current red zone (128 bytes wide)
-  EmitInstruction(Out, MCInstBuilder(X86::SUB64ri32).addReg(X86::RSP)
-                           .addReg(X86::RSP).addImm(128));
+
+  // Set %rsp below current red zone (128 bytes wide) using LEA instruction to
+  // preserve flags.
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+    const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
   EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
   {
     MCInst Inst;
@@ -210,8 +212,19 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(
     EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
   }
   EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
-  EmitInstruction(Out, MCInstBuilder(X86::ADD64ri32).addReg(X86::RSP)
-                           .addReg(X86::RSP).addImm(128));
+
+  // Restore old %rsp value.
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+    const MCExpr *Disp = MCConstantExpr::Create(128, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
 }
 
 } // End anonymous namespace
@@ -221,10 +234,15 @@ X86AsmInstrumentation::~X86AsmInstrumentation() {}
 
 void X86AsmInstrumentation::InstrumentInstruction(
     const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-    MCContext &Ctx, MCStreamer &Out) {}
-
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI) {
-  if (ClAsanInstrumentInlineAssembly) {
+    MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI) {
+  Triple T(STI.getTargetTriple());
+  const bool hasCompilerRTSupport = T.isOSLinux();
+  if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+      MCOptions.SanitizeAddress) {
     if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
       return new X86AddressSanitizer32(STI);
     if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index c783a78..0369b14 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -16,13 +16,17 @@ namespace llvm {
 
 class MCContext;
 class MCInst;
+class MCInstrInfo;
 class MCParsedAsmOperand;
 class MCStreamer;
 class MCSubtargetInfo;
+class MCTargetOptions;
 
 class X86AsmInstrumentation;
 
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI);
 
 class X86AsmInstrumentation {
 public:
@@ -32,15 +36,18 @@ public:
   // instruction is sent to Out.
   virtual void InstrumentInstruction(
       const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-      MCContext &Ctx, MCStreamer &Out);
+      MCContext &Ctx,
+      const MCInstrInfo &MII,
+      MCStreamer &Out);
 
 protected:
   friend X86AsmInstrumentation *
-  CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+  CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                              const MCContext &Ctx, const MCSubtargetInfo &STI);
 
   X86AsmInstrumentation();
 };
 
-} // End llvm namespace
+} // End llvm namespace
 
-#endif // X86_ASM_INSTRUMENTATION_H
+#endif // X86_ASM_INSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 9eddc74..d3e695e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -55,6 +56,7 @@ static const char OpPrecedence[] = {
 class X86AsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   ParseInstructionInfo *InstInfo;
   std::unique_ptr<X86AsmInstrumentation> Instrumentation;
 private:
@@ -257,7 +259,7 @@ private:
   public:
     IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
       State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
-      Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac),
+      Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
       AddImmPrefix(addimmprefix) { Info.clear(); }
 
     unsigned getBaseReg() { return BaseReg; }
@@ -618,7 +620,7 @@ private:
 
   X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
     Error(Loc, Msg);
-    return 0;
+    return nullptr;
   }
 
   X86Operand *DefaultMemSIOperand(SMLoc Loc);
@@ -710,13 +712,17 @@ private:
 
 public:
   X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-               const MCInstrInfo &MII)
-      : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+               const MCInstrInfo &mii,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii),
+        InstInfo(nullptr) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
-    Instrumentation.reset(CreateX86AsmInstrumentation(STI));
+    Instrumentation.reset(
+        CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
   }
+
   bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
   bool
@@ -1173,9 +1179,9 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // expression.
   IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
   if (ParseIntelExpression(SM, End))
-    return 0;
+    return nullptr;
 
-  const MCExpr *Disp = 0;
+  const MCExpr *Disp = nullptr;
   if (const MCExpr *Sym = SM.getSym()) {
     // A symbolic displacement.
     Disp = Sym;
@@ -1199,7 +1205,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
     if (Tok.getString().find('.') != StringRef::npos) {
       const MCExpr *NewDisp;
       if (ParseIntelDotOperator(Disp, NewDisp))
-        return 0;
+        return nullptr;
 
       End = Tok.getEndLoc();
       Parser.Lex(); // Eat the field.
@@ -1220,7 +1226,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   StringRef ErrMsg;
   if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
     Error(StartInBrac, ErrMsg);
-    return 0;
+    return nullptr;
   }
   return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
                                End, Size);
@@ -1237,7 +1243,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
                                         InlineAsmIdentifierInfo &Info,
                                         bool IsUnevaluatedOperand, SMLoc &End) {
   assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
-  Val = 0;
+  Val = nullptr;
 
   StringRef LineBuf(Identifier.data());
   SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
@@ -1309,7 +1315,7 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/false, End))
-    return 0;
+    return nullptr;
   return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
                                /*Scale=*/1, Start, End, Size, Identifier, Info);
 }
@@ -1337,7 +1343,7 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
     StringRef Identifier = Tok.getString();
     if (ParseIntelIdentifier(Val, Identifier, Info,
                              /*Unevaluated=*/false, End))
-      return 0;
+      return nullptr;
 
     if (!getLexer().is(AsmToken::LBrac))
       return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
@@ -1349,19 +1355,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
     IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
                              /*AddImmPrefix=*/false);
     if (ParseIntelExpression(SM, End))
-      return 0;
+      return nullptr;
 
     if (SM.getSym()) {
       Error(Start, "cannot use more than one symbol in memory operand");
-      return 0;
+      return nullptr;
     }
     if (SM.getBaseReg()) {
       Error(Start, "cannot use base register with variable reference");
-      return 0;
+      return nullptr;
     }
     if (SM.getIndexReg()) {
       Error(Start, "cannot use index register with variable reference");
-      return 0;
+      return nullptr;
     }
 
     const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
@@ -1430,7 +1436,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/false, End))
-    return 0;
+    return nullptr;
 
   // Don't emit the offset operator.
   InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1461,13 +1467,13 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   SMLoc TypeLoc = Tok.getLoc();
   Parser.Lex(); // Eat operator.
-  const MCExpr *Val = 0;
+  const MCExpr *Val = nullptr;
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/true, End))
-    return 0;
+    return nullptr;
 
   if (!Info.OpDecl)
     return ErrorOperand(Start, "unable to lookup expression");
@@ -1522,7 +1528,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
   IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
                            /*AddImmPrefix=*/false);
   if (ParseIntelExpression(SM, End))
-    return 0;
+    return nullptr;
 
   int64_t Imm = SM.getImm();
   if (isParsingInlineAsm()) {
@@ -1580,11 +1586,11 @@ X86Operand *X86AsmParser::ParseATTOperand() {
     // Read the register.
    unsigned RegNo;
    SMLoc Start, End;
-    if (ParseRegister(RegNo, Start, End)) return 0;
+    if (ParseRegister(RegNo, Start, End)) return nullptr;
    if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
      Error(Start, "%eiz and %riz can only be used as index registers",
            SMRange(Start, End));
-      return 0;
+      return nullptr;
    }
 
    // If this is a segment register followed by a ':', then this is the start
@@ -1601,7 +1607,7 @@ X86Operand *X86AsmParser::ParseATTOperand() {
    Parser.Lex();
    const MCExpr *Val;
    if (getParser().parseExpression(Val, End))
-      return 0;
+      return nullptr;
    return X86Operand::CreateImm(Val, Start, End);
  }
 }
@@ -1630,7 +1636,7 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
          StringSwitch<const char*>(getLexer().getTok().getIdentifier())
            .Case("to8", "{1to8}")
            .Case("to16", "{1to16}")
-            .Default(0);
+            .Default(nullptr);
        if (!BroadcastPrimitive)
          return !ErrorAndEatStatement(getLexer().getLoc(),
                                       "Invalid memory broadcast primitive.");
@@ -1685,7 +1691,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
  if (getLexer().isNot(AsmToken::LParen)) {
    SMLoc ExprEnd;
-    if (getParser().parseExpression(Disp, ExprEnd)) return 0;
+    if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
 
    // After parsing the base expression we could either have a parenthesized
    // memory address or not.  If not, return now.  If so, eat the (.
@@ -1712,7 +1718,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
 
    // It must be an parenthesized expression, parse it now.
    if (getParser().parseParenExpression(Disp, ExprEnd))
-      return 0;
+      return nullptr;
 
    // After parsing the base expression we could either have a parenthesized
    // memory address or not.  If not, return now.  If so, eat the (.
@@ -1736,11 +1742,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
  if (getLexer().is(AsmToken::Percent)) {
    SMLoc StartLoc, EndLoc;
    BaseLoc = Parser.getTok().getLoc();
-    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
+    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
    if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
      Error(StartLoc, "eiz and riz can only be used as index registers",
            SMRange(StartLoc, EndLoc));
-      return 0;
+      return nullptr;
    }
  }
 
@@ -1756,7 +1762,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
    // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
    if (getLexer().is(AsmToken::Percent)) {
      SMLoc L;
-      if (ParseRegister(IndexReg, L, L)) return 0;
+      if (ParseRegister(IndexReg, L, L)) return nullptr;
 
      if (getLexer().isNot(AsmToken::RParen)) {
        // Parse the scale amount:
@@ -1764,7 +1770,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
        if (getLexer().isNot(AsmToken::Comma)) {
          Error(Parser.getTok().getLoc(),
                "expected comma in scale expression");
-          return 0;
+          return nullptr;
        }
        Parser.Lex(); // Eat the comma.
 
@@ -1774,18 +1780,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
        int64_t ScaleVal;
        if (getParser().parseAbsoluteExpression(ScaleVal)){
          Error(Loc, "expected scale expression");
-          return 0;
+          return nullptr;
        }
 
        // Validate the scale amount.
        if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
            ScaleVal != 1) {
          Error(Loc, "scale factor in 16-bit address must be 1");
-          return 0;
+          return nullptr;
        }
        if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
          Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
-          return 0;
+          return nullptr;
        }
        Scale = (unsigned)ScaleVal;
      }
@@ -1797,7 +1803,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
 
      int64_t Value;
      if (getParser().parseAbsoluteExpression(Value))
-        return 0;
+        return nullptr;
 
      if (Value != 1)
        Warning(Loc, "scale factor without index register is ignored");
@@ -1808,7 +1814,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
  // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
  if (getLexer().isNot(AsmToken::RParen)) {
    Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
-    return 0;
+    return nullptr;
  }
  SMLoc MemEnd = Parser.getTok().getEndLoc();
  Parser.Lex(); // Eat the ')'.
@@ -1821,18 +1827,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
        BaseReg != X86::SI && BaseReg != X86::DI)) &&
      BaseReg != X86::DX) {
    Error(BaseLoc, "invalid 16-bit base register");
-    return 0;
+    return nullptr;
  }
  if (BaseReg == 0 &&
      X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
    Error(IndexLoc, "16-bit memory operand may not include only index register");
-    return 0;
+    return nullptr;
  }
 
  StringRef ErrMsg;
  if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
    Error(BaseLoc, ErrMsg);
-    return 0;
+    return nullptr;
  }
 
  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
@@ -1851,7 +1857,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
    PatchedName = PatchedName.substr(0, Name.size()-1);
 
  // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
-  const MCExpr *ExtraImmOp = 0;
+  const MCExpr *ExtraImmOp = nullptr;
   if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
       (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
        PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
@@ -2070,8 +2076,10 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
       (Name == "smov" || Name == "smovb" || Name == "smovw" ||
        Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
     if (Operands.size() == 1) {
-      if (Name == "movsd")
+      if (Name == "movsd") {
+        delete Operands.back();
         Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+      }
       if (isParsingIntelSyntax()) {
         Operands.push_back(DefaultMemDIOperand(NameLoc));
         Operands.push_back(DefaultMemSIOperand(NameLoc));
@@ -2253,7 +2261,8 @@ static const char *getSubtargetFeatureName(unsigned Val);
 void X86AsmParser::EmitInstruction(
     MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
     MCStreamer &Out) {
-  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), Out);
+  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
+                                         Out);
   Out.EmitInstruction(Inst, STI);
 }
 
@@ -2291,7 +2300,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       .Case("fstsw", "fnstsw")
       .Case("fstsww", "fnstsw")
       .Case("fclex", "fnclex")
-      .Default(0);
+      .Default(nullptr);
     assert(Repl && "Unknown wait-prefixed instruction");
     delete Operands[0];
     Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 45fe2a9..de3be38 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -422,7 +422,7 @@ struct X86Operand : public MCParsedAsmOperand {
                                 bool AddressOf = false,
                                 SMLoc OffsetOfLoc = SMLoc(),
                                 StringRef SymName = StringRef(),
-                                void *OpDecl = 0) {
+                                void *OpDecl = nullptr) {
     X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
     Res->Reg.RegNo = RegNo;
     Res->AddressOf = AddressOf;
@@ -441,7 +441,7 @@ struct X86Operand : public MCParsedAsmOperand {
   /// Create an absolute memory operand.
   static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
                                unsigned Size = 0, StringRef SymName = StringRef(),
-                               void *OpDecl = 0) {
+                               void *OpDecl = nullptr) {
     X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
     Res->Mem.SegReg = 0;
     Res->Mem.Disp = Disp;
@@ -461,7 +461,7 @@ struct X86Operand : public MCParsedAsmOperand {
                                unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
                                unsigned Size = 0,
                                StringRef SymName = StringRef(),
-                               void *OpDecl = 0) {
+                               void *OpDecl = nullptr) {
     // We should never just have a displacement, that should be parsed as an
     // absolute memory operand.
     assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 206b651..c54fbc1 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,7 +14,6 @@ add_public_tablegen_target(X86CommonTableGen)
 
 set(sources
   X86AsmPrinter.cpp
-  X86COFFMachineModuleInfo.cpp
   X86CodeEmitter.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
diff --git a/lib/Target/X86/Disassembler/Android.mk b/lib/Target/X86/Disassembler/Android.mk
index 3984266..0b3b8a5 100644
--- a/lib/Target/X86/Disassembler/Android.mk
+++ b/lib/Target/X86/Disassembler/Android.mk
@@ -8,7 +8,8 @@ x86_disassembler_TBLGEN_TABLES := \
 
 x86_disassembler_SRC_FILES := \
   X86Disassembler.cpp \
-  X86DisassemblerDecoder.c
+  X86DisassemblerDecoder.cpp
+
 
 # For the device
 # =====================================================
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index deed115..4370282 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -1,4 +1,4 @@
 add_llvm_library(LLVMX86Disassembler
   X86Disassembler.cpp
-  X86DisassemblerDecoder.c
+  X86DisassemblerDecoder.cpp
   )
diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile
index 8669fd8..51e7b82 100644
--- a/lib/Target/X86/Disassembler/Makefile
+++ b/lib/Target/X86/Disassembler/Makefile
@@ -10,7 +10,9 @@ LEVEL = ../../../..
 
 LIBRARYNAME = LLVMX86Disassembler
 
-# Hack: we need to include 'main' x86 target directory to grab private headers
+# Hack: we need to include 'main' x86 target directory to grab private headers.
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
+
+.PHONY: $(PROJ_SRC_DIR)/X86DisassemblerDecoder.c
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index d5759cd..c366725 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -27,6 +27,11 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
 #define GET_REGINFO_ENUM
 #include "X86GenRegisterInfo.inc"
 #define GET_INSTRINFO_ENUM
@@ -34,21 +39,18 @@
 #define GET_SUBTARGETINFO_ENUM
 #include "X86GenSubtargetInfo.inc"
 
-using namespace llvm;
-using namespace llvm::X86Disassembler;
-
-void x86DisassemblerDebug(const char *file,
-                          unsigned line,
-                          const char *s) {
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+                                  const char *s) {
   dbgs() << file << ":" << line << ": " << s;
 }
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
+const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+                                                const void *mii) {
   const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
   return MII->getName(Opcode);
 }
 
-#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
 
 namespace llvm {
 
@@ -74,9 +76,11 @@ static bool translateInstruction(MCInst &target,
                                  InternalInstruction &source,
                                  const MCDisassembler *Dis);
 
-X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
-                                               const MCInstrInfo *MII)
-  : MCDisassembler(STI), MII(MII) {
+X86GenericDisassembler::X86GenericDisassembler(
+    const MCSubtargetInfo &STI,
+    MCContext &Ctx,
+    std::unique_ptr<const MCInstrInfo> MII)
+  : MCDisassembler(STI, Ctx),
    MII(std::move(MII)) {
   switch (STI.getFeatureBits() &
           (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) {
   case X86::Mode16Bit:
@@ -93,10 +97,6 @@ X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
   }
 }
 
-X86GenericDisassembler::~X86GenericDisassembler() {
-  delete MII;
-}
-
 /// regionReader - a callback function that wraps the readByte method from
 ///   MemoryObject.
 ///
@@ -140,14 +140,14 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
 
   dlog_t loggerFn = logger;
   if (&vStream == &nulls())
-    loggerFn = 0; // Disable logging completely if it's going to nulls().
+    loggerFn = nullptr; // Disable logging completely if it's going to nulls().
 
   int ret = decodeInstruction(&internalInstr,
                               regionReader,
                               (const void*)&region,
                               loggerFn,
                               (void*)&vStream,
-                              (const void*)MII,
+                              (const void*)MII.get(),
                               address,
                               fMode);
 
@@ -319,7 +319,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   }
   // By default sign-extend all X86 immediates based on their encoding.
   else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
-           type == TYPE_IMM64) {
+           type == TYPE_IMM64 || type == TYPE_IMMv) {
     uint32_t Opcode = mcInst.getOpcode();
     switch (operand.encoding) {
     default:
@@ -787,13 +787,11 @@ static bool translateInstruction(MCInst &mcInst,
     mcInst.setOpcode(X86::XACQUIRE_PREFIX);
   }
 
-  int index;
-
   insn.numImmediatesTranslated = 0;
-
-  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
-    if (insn.operands[index].encoding != ENCODING_NONE) {
-      if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
+  for (const auto &Op : insn.operands) {
+    if (Op.encoding != ENCODING_NONE) {
+      if (translateOperand(mcInst, Op, insn, Dis)) {
         return true;
       }
     }
@@ -803,9 +801,10 @@ static bool translateInstruction(MCInst &mcInst,
 }
 
 static MCDisassembler *createX86Disassembler(const Target &T,
-                                             const MCSubtargetInfo &STI) {
-  return new X86Disassembler::X86GenericDisassembler(STI,
-                                                     T.createMCInstrInfo());
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+  return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
 }
 
 extern "C" void LLVMInitializeX86Disassembler() {
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 4e6e297..4dc7c29 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -74,17 +74,7 @@
 #ifndef X86DISASSEMBLER_H
 #define X86DISASSEMBLER_H
 
-#define INSTRUCTION_SPECIFIER_FIELDS \
-  uint16_t operands;
-
-#define INSTRUCTION_IDS \
-  uint16_t instructionIDs;
-
 #include "X86DisassemblerDecoderCommon.h"
-
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
-
 #include "llvm/MC/MCDisassembler.h"
 
 namespace llvm {
@@ -101,13 +91,12 @@ namespace X86Disassembler {
 /// All each platform class should have to do is subclass the constructor, and
 /// provide a different disassemblerMode value.
 class X86GenericDisassembler : public MCDisassembler {
-  const MCInstrInfo *MII;
+  std::unique_ptr<const MCInstrInfo> MII;
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  X86GenericDisassembler(const MCSubtargetInfo &STI, const MCInstrInfo *MII);
-private:
-  ~X86GenericDisassembler();
+  X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                         std::unique_ptr<const MCInstrInfo> MII);
 public:
 
   /// getInstruction - See MCDisassembler.
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 0801c96..804606d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1,17 +1,17 @@
-/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the implementation of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
 
 #include <stdarg.h>   /* for va_*()       */
 #include <stdio.h>    /* for vsnprintf()  */
@@ -20,13 +20,35 @@
 
 #include "X86DisassemblerDecoder.h"
 
-#include "X86GenDisassemblerTables.inc"
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to. Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+  uint8_t modrm_type;
+  uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+  ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes). Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes. Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+  OpcodeDecision opcodeDecisions[IC_max];
+};
 
-#define TRUE  1
-#define FALSE 0
+#include "X86GenDisassemblerTables.inc"
 
 #ifndef NDEBUG
-#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
 #else
 #define debug(s) do { } while (0)
 #endif
@@ -41,7 +63,7 @@
  *   an instruction with these attributes.
  */
 static InstructionContext contextForAttrs(uint16_t attrMask) {
-  return CONTEXTS_SYM[attrMask];
+  return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
 }
 
 /*
@@ -53,12 +75,12 @@ static InstructionContext contextForAttrs(uint16_t attrMask) {
 *                contextForAttrs.
 * @param opcode - The last byte of the instruction's opcode, not counting
 *                 ModR/M extensions and escapes.
 * @return       - TRUE if the ModR/M byte is required, FALSE otherwise.
 */
 static int modRMRequired(OpcodeType type,
                          InstructionContext insnContext,
                          uint16_t opcode) {
-  const struct ContextDecision* decision = 0;
+  const struct ContextDecision* decision = nullptr;
 
   switch (type) {
   case ONEBYTE:
@@ -102,7 +124,7 @@ static InstrUID decode(OpcodeType type,
                        InstructionContext insnContext,
                        uint8_t opcode,
                        uint8_t modRM) {
-  const struct ModRMDecision* dec = 0;
+  const struct ModRMDecision* dec = nullptr;
 
   switch (type) {
   case ONEBYTE:
@@ -284,15 +306,15 @@ static void setPrefixPresent(struct InternalInstruction* insn,
 * @param location  - The location to query.
 * @return          - Whether the prefix is at that location.
 */
-static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
                                uint8_t prefix,
                                uint64_t location)
 {
   if (insn->prefixPresent[prefix] == 1 &&
       insn->prefixLocations[prefix] == location)
-    return TRUE;
+    return true;
   else
-    return FALSE;
+    return false;
 }
 
 /*
@@ -305,14 +327,14 @@ static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
 *                 bytes, and no prefixes conflicted; nonzero otherwise.
 */
 static int readPrefixes(struct InternalInstruction* insn) {
-  BOOL isPrefix = TRUE;
-  BOOL prefixGroups[4] = { FALSE };
+  bool isPrefix = true;
+  bool prefixGroups[4] = { false };
   uint64_t prefixLocation;
   uint8_t byte = 0;
   uint8_t nextByte;
 
-  BOOL hasAdSize = FALSE;
-  BOOL hasOpSize = FALSE;
+  bool hasAdSize = false;
+  bool hasOpSize = false;
 
   dbgprintf(insn, "readPrefixes()");
 
@@ -344,7 +366,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
     if ((byte == 0xf2 || byte == 0xf3) &&
         ((nextByte == 0xf0) |
          ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
-      insn->xAcquireRelease = TRUE;
+      insn->xAcquireRelease = true;
     /*
      * Also if the byte is 0xf3, and the following condition is met:
      * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
     */
     if (byte == 0xf3 &&
         (nextByte == 0x88 || nextByte == 0x89 ||
          nextByte == 0xc6 || nextByte == 0xc7))
-      insn->xAcquireRelease = TRUE;
+      insn->xAcquireRelease = true;
     if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
       if (consumeByte(insn, &nextByte))
         return -1;
@@ -372,7 +394,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
     case 0xf3:  /* REP or REPE/REPZ */
       if (prefixGroups[0])
         dbgprintf(insn, "Redundant Group 1 prefix");
-      prefixGroups[0] = TRUE;
+      prefixGroups[0] = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x2e:  /* CS segment override -OR- Branch not taken */
@@ -406,25 +428,25 @@ static int readPrefixes(struct InternalInstruction* insn) {
       }
       if (prefixGroups[1])
         dbgprintf(insn, "Redundant Group 2 prefix");
-      prefixGroups[1] = TRUE;
+      prefixGroups[1] = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x66:  /* Operand-size override */
       if (prefixGroups[2])
         dbgprintf(insn, "Redundant Group 3 prefix");
-      prefixGroups[2] = TRUE;
-      hasOpSize = TRUE;
+      prefixGroups[2] = true;
+      hasOpSize = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x67:  /* Address-size override */
      if (prefixGroups[3])
        dbgprintf(insn, "Redundant Group 4 prefix");
-      prefixGroups[3] = TRUE;
-      hasAdSize = TRUE;
+      prefixGroups[3] = true;
+      hasAdSize = true;
      setPrefixPresent(insn, byte, prefixLocation);
      break;
    default:    /* Not a prefix byte */
-      isPrefix = FALSE;
+      isPrefix = false;
      break;
    }
 
@@ -549,7 +571,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       default:
         break;
       case VEX_PREFIX_66:
-        hasOpSize = TRUE;
+        hasOpSize = true;
         break;
       }
 
@@ -595,7 +617,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       default:
         break;
       case VEX_PREFIX_66:
-        hasOpSize = TRUE;
+        hasOpSize = true;
        break;
      }
 
@@ -790,11 +812,9 @@ static int readModRM(struct InternalInstruction* insn);
 static int getIDWithAttrMask(uint16_t* instructionID,
                              struct InternalInstruction* insn,
                              uint16_t attrMask) {
-  BOOL hasModRMExtension;
-
-  uint16_t instructionClass;
+  bool hasModRMExtension;
 
-  instructionClass = contextForAttrs(attrMask);
+  InstructionContext instructionClass = contextForAttrs(attrMask);
 
   hasModRMExtension = modRMRequired(insn->opcodeType,
                                     instructionClass,
@@ -825,14 +845,14 @@ static int getIDWithAttrMask(uint16_t* instructionID,
 * @param orig  - The instruction that is not 16-bit
 * @param equiv - The instruction that is 16-bit
 */
-static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+static bool is16BitEquivalent(const char* orig, const char* equiv) {
   off_t i;
 
   for (i = 0;; i++) {
     if (orig[i] == '\0' && equiv[i] == '\0')
-      return TRUE;
+      return true;
     if (orig[i] == '\0' || equiv[i] == '\0')
-      return FALSE;
+      return false;
     if (orig[i] != equiv[i]) {
       if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
         continue;
@@ -840,7 +860,7 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
         continue;
       if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
         continue;
-      return FALSE;
+      return false;
     }
   }
 }
@@ -1011,9 +1031,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
       return 0;
     }
 
-    specName = x86DisassemblerGetInstrName(instructionID, miiArg);
-    specWithOpSizeName =
-      x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+    specName = GetInstrName(instructionID, miiArg);
+    specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
 
     if (is16BitEquivalent(specName, specWithOpSizeName) &&
         (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
@@ -1077,8 +1096,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
 * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
 */
 static int readSIB(struct InternalInstruction* insn) {
-  SIBIndex sibIndexBase = 0;
-  SIBBase sibBaseBase = 0;
+  SIBIndex sibIndexBase = SIB_INDEX_NONE;
+  SIBBase sibBaseBase = SIB_BASE_NONE;
   uint8_t index, base;
 
   dbgprintf(insn, "readSIB()");
 
@@ -1086,7 +1105,7 @@ static int readSIB(struct InternalInstruction* insn) {
   if (insn->consumedSIB)
     return 0;
 
-  insn->consumedSIB = TRUE;
+  insn->consumedSIB = true;
 
   switch (insn->addressSize) {
   case 2:
@@ -1184,12 +1203,12 @@ static int readDisplacement(struct InternalInstruction* insn) {
   if (insn->consumedDisplacement)
     return 0;
 
-  insn->consumedDisplacement = TRUE;
+  insn->consumedDisplacement = true;
   insn->displacementOffset = insn->readerCursor - insn->startLocation;
 
   switch (insn->eaDisplacement) {
   case EA_DISP_NONE:
-    insn->consumedDisplacement = FALSE;
+    insn->consumedDisplacement = false;
     break;
   case EA_DISP_8:
     if (consumeInt8(insn, &d8))
@@ -1208,7 +1227,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
     break;
   }
 
-  insn->consumedDisplacement = TRUE;
+  insn->consumedDisplacement = true;
   return 0;
 }
 
@@ -1229,7 +1248,7 @@ static int readModRM(struct InternalInstruction* insn) {
   if (consumeByte(insn, &insn->modRM))
     return -1;
-  insn->consumedModRM = TRUE;
+  insn->consumedModRM = true;
 
   mod     = modFromModRM(insn->modRM);
   rm      = rmFromModRM(insn->modRM);
@@ -1599,20 +1618,22 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
 static int readVVVV(struct InternalInstruction* insn) {
   dbgprintf(insn, "readVVVV()");
 
+  int vvvv;
   if (insn->vectorExtensionType == TYPE_EVEX)
-    insn->vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
   else if (insn->vectorExtensionType == TYPE_VEX_3B)
-    insn->vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
   else if (insn->vectorExtensionType == TYPE_VEX_2B)
-    insn->vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+    vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
   else if (insn->vectorExtensionType == TYPE_XOP)
-    insn->vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
   else
     return -1;
 
   if (insn->mode != MODE_64BIT)
-    insn->vvvv &= 0x7;
+    vvvv &= 0x7;
 
+  insn->vvvv = static_cast<Reg>(vvvv);
   return 0;
 }
 
@@ -1629,7 +1650,8 @@ static int readMaskRegister(struct InternalInstruction* insn) {
   if (insn->vectorExtensionType != TYPE_EVEX)
     return -1;
 
-  insn->writemask = aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+  insn->writemask =
+      static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
   return 0;
 }
 
@@ -1641,7 +1663,6 @@ static int readMaskRegister(struct InternalInstruction* insn) {
 *  @return     - 0 if all operands could be read; nonzero otherwise.
 */
 static int readOperands(struct InternalInstruction* insn) {
-  int index;
   int hasVVVV, needVVVV;
   int sawRegImm = 0;
 
@@ -1652,8 +1673,8 @@ static int readOperands(struct InternalInstruction* insn) {
   hasVVVV = !readVVVV(insn);
   needVVVV = hasVVVV && (insn->vvvv != 0);
 
-  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
-    switch (x86OperandSets[insn->spec->operands][index].encoding) {
+  for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+    switch (Op.encoding) {
     case ENCODING_NONE:
     case ENCODING_SI:
     case ENCODING_DI:
@@ -1662,7 +1683,7 @@ static int readOperands(struct InternalInstruction* insn) {
     case ENCODING_RM:
       if (readModRM(insn))
         return -1;
-      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+      if (fixupReg(insn, &Op))
         return -1;
       break;
     case ENCODING_CB:
@@ -1684,14 +1705,14 @@ static int readOperands(struct InternalInstruction* insn) {
       }
       if (readImmediate(insn, 1))
         return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
+      if (Op.type == TYPE_IMM3 &&
           insn->immediates[insn->numImmediatesConsumed - 1] > 7)
         return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
+      if (Op.type == TYPE_IMM5 &&
          insn->immediates[insn->numImmediatesConsumed - 1] > 31)
        return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
-          x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
+      if (Op.type == TYPE_XMM128 ||
+          Op.type == TYPE_XMM256)
        sawRegImm = 1;
      break;
    case ENCODING_IW:
@@ -1740,7 +1761,7 @@ static int readOperands(struct InternalInstruction* insn) {
       needVVVV = 0; /* Mark that we have found a VVVV operand. */
       if (!hasVVVV)
         return -1;
-      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+      if (fixupReg(insn, &Op))
         return -1;
       break;
     case ENCODING_WRITEMASK:
@@ -1781,14 +1802,10 @@ static int readOperands(struct InternalInstruction* insn) {
 * @return      - 0 if the instruction's memory could be read; nonzero if
 *                not.
 */
-int decodeInstruction(struct InternalInstruction* insn,
-                      byteReader_t reader,
-                      const void* readerArg,
-                      dlog_t logger,
-                      void* loggerArg,
-                      const void* miiArg,
-                      uint64_t startLoc,
-                      DisassemblerMode mode) {
+int llvm::X86Disassembler::decodeInstruction(
+    struct InternalInstruction *insn, byteReader_t reader,
+    const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+    uint64_t startLoc, DisassemblerMode mode) {
   memset(insn, 0, sizeof(struct InternalInstruction));
 
   insn->reader = reader;
@@ -1807,7 +1824,7 @@ int decodeInstruction(struct InternalInstruction* insn,
       readOperands(insn))
     return -1;
 
-  insn->operands = &x86OperandSets[insn->spec->operands][0];
+  insn->operands = x86OperandSets[insn->spec->operands];
 
   insn->length = insn->readerCursor - insn->startLocation;
 
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index ac3b39d..8c45402 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,39 +1,28 @@
-/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the public interface of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef X86DISASSEMBLERDECODER_H
 #define X86DISASSEMBLERDECODER_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define INSTRUCTION_SPECIFIER_FIELDS \
-  uint16_t operands;
-
-#define INSTRUCTION_IDS \
-  uint16_t instructionIDs;
-
 #include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
 
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
+namespace llvm {
+namespace X86Disassembler {
 
-/*
- * Accessor functions for various fields of an Intel instruction
- */
+// Accessor functions for various fields of an Intel instruction
 #define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
 #define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
 #define rmFromModRM(modRM)   ((modRM) & 0x7)
@@ -83,10 +72,7 @@ extern "C" {
 #define lFromXOP3of3(xop)      (((xop) & 0x4) >> 2)
 #define ppFromXOP3of3(xop)     ((xop) & 0x3)
 
-/*
- * These enums represent Intel registers for use by the decoder.
- */
-
+// These enums represent Intel registers for use by the decoder.
 #define REGS_8BIT    \
   ENTRY(AL)          \
   ENTRY(CL)          \
@@ -392,13 +378,11 @@ extern "C" {
   REGS_CONTROL      \
   ENTRY(RIP)
 
-/*
- * EABase - All possible values of the base field for effective-address
- *   computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We
- *   distinguish between bases (EA_BASE_*) and registers that just happen to be
- *   referred to when Mod == 0b11 (EA_REG_*).
- */
-typedef enum {
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
   EA_BASE_NONE,
 #define ENTRY(x) EA_BASE_##x,
   ALL_EA_BASES
@@ -407,15 +391,13 @@ typedef enum {
   ALL_REGS
 #undef ENTRY
   EA_max
-} EABase;
-
-/*
- * SIBIndex - All possible values of the SIB index field.
- *   Borrows entries from ALL_EA_BASES with the special case that
- *   sib is synonymous with NONE.
- * Vector SIB: index can be XMM or YMM.
- */
-typedef enum {
+};
+
+/// \brief All possible values of the SIB index field.
+/// borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM or YMM.
+enum SIBIndex {
   SIB_INDEX_NONE,
 #define ENTRY(x) SIB_INDEX_##x,
   ALL_EA_BASES
@@ -424,23 +406,18 @@ typedef enum {
   REGS_ZMM
 #undef ENTRY
   SIB_INDEX_max
-} SIBIndex;
+};
 
-/*
- * SIBBase - All possible values of the SIB base field.
- */
-typedef enum {
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
   SIB_BASE_NONE,
#define ENTRY(x) SIB_BASE_##x,
   ALL_SIB_BASES
 #undef ENTRY
   SIB_BASE_max
-} SIBBase;
+};
 
-/*
- * EADisplacement - Possible displacement types for effective-address
- *   computations.
- */
+/// \brief Possible displacement types for effective-address computations.
 typedef enum {
   EA_DISP_NONE,
   EA_DISP_8,
@@ -448,20 +425,16 @@ typedef enum {
   EA_DISP_32
 } EADisplacement;
 
-/*
- * Reg - All possible values of the reg field in the ModR/M byte.
- */
-typedef enum {
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
 #define ENTRY(x) MODRM_REG_##x,
   ALL_REGS
 #undef ENTRY
   MODRM_REG_max
-} Reg;
+};
 
-/*
- * SegmentOverride - All possible segment overrides.
- */
-typedef enum {
+/// \brief All possible segment overrides.
+enum SegmentOverride {
   SEG_OVERRIDE_NONE,
   SEG_OVERRIDE_CS,
   SEG_OVERRIDE_SS,
@@ -470,235 +443,220 @@ typedef enum {
   SEG_OVERRIDE_FS,
   SEG_OVERRIDE_GS,
   SEG_OVERRIDE_max
-} SegmentOverride;
-
-/*
- * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
- */
-
-typedef enum {
+};
+
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
   VEX_LOB_0F = 0x1,
   VEX_LOB_0F38 = 0x2,
   VEX_LOB_0F3A = 0x3
-} VEXLeadingOpcodeByte;
+};
 
-typedef enum {
+enum XOPMapSelect {
   XOP_MAP_SELECT_8 = 0x8,
   XOP_MAP_SELECT_9 = 0x9,
   XOP_MAP_SELECT_A = 0xA
-} XOPMapSelect;
-
-/*
- * VEXPrefixCode - Possible values for the VEX.pp/EVEX.pp field
- */
-
-typedef enum {
+};
+
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
   VEX_PREFIX_NONE = 0x0,
   VEX_PREFIX_66 = 0x1,
   VEX_PREFIX_F3 = 0x2,
   VEX_PREFIX_F2 = 0x3
-} VEXPrefixCode;
+};
 
-typedef enum {
+enum VectorExtensionType {
   TYPE_NO_VEX_XOP = 0x0,
   TYPE_VEX_2B = 0x1,
   TYPE_VEX_3B = 0x2,
   TYPE_EVEX = 0x3,
   TYPE_XOP = 0x4
-} VectorExtensionType;
-
-typedef uint8_t BOOL;
-
-/*
- * byteReader_t - Type for the byte reader that the consumer must provide to
- *   the decoder. Reads a single byte from the instruction's address space.
- * @param arg     - A baton that the consumer can associate with any internal
- *                  state that it needs.
- * @param byte    - A pointer to a single byte in memory that should be set to
- *                  contain the value at address.
- * @param address - The address in the instruction's address space that should
- *                  be read from.
- * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
- */
-typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
-
-/*
- * dlog_t - Type for the logging function that the consumer can provide to
- *   get debugging output from the decoder.
- * @param arg - A baton that the consumer can associate with any internal
- *              state that it needs.
- * @param log - A string that contains the message. Will be reused after
- *              the logger returns.
- */
-typedef void (*dlog_t)(void* arg, const char *log);
-
-/*
- * The x86 internal instruction, which is produced by the decoder.
- */
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg     A baton that the consumer can associate with any internal
+///                state that it needs.
+/// \param byte    A pointer to a single byte in memory that should be set to
+///                contain the value at address.
+/// \param address The address in the instruction's address space that should
+///                be read from.
+/// \return        -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+///            state that it needs.
+/// \param log A string that contains the message.  Will be reused after
+///            the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+  uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
 struct InternalInstruction {
-  /* Reader interface (C) */
+  // Reader interface (C)
   byteReader_t reader;
-  /* Opaque value passed to the reader */
+  // Opaque value passed to the reader
   const void* readerArg;
-  /* The address of the next byte to read via the reader */
+  // The address of the next byte to read via the reader
   uint64_t readerCursor;
 
-  /* Logger interface (C) */
+  // Logger interface (C)
   dlog_t dlog;
-  /* Opaque value passed to the logger */
+  // Opaque value passed to the logger
   void* dlogArg;
 
-  /* General instruction information */
+  // General instruction information
 
-  /* The mode to disassemble for (64-bit, protected, real) */
+  // The mode to disassemble for (64-bit, protected, real)
   DisassemblerMode mode;
-  /* The start of the instruction, usable with the reader */
+  // The start of the instruction, usable with the reader
   uint64_t startLocation;
-  /* The length of the instruction, in bytes */
+  // The length of the instruction, in bytes
   size_t length;
 
-  /* Prefix state */
+  // Prefix state
 
-  /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+  // 1 if the prefix byte corresponding to the entry is present; 0 if not
   uint8_t prefixPresent[0x100];
-  /* contains the location (for use with the reader) of the prefix byte */
+  // contains the location (for use with the reader) of the prefix byte
   uint64_t prefixLocations[0x100];
-  /* The value of the vector extension prefix(EVEX/VEX/XOP), if present */
+  // The value of the vector extension prefix(EVEX/VEX/XOP), if present
   uint8_t vectorExtensionPrefix[4];
-  /* The type of the vector extension prefix */
+  // The type of the vector extension prefix
   VectorExtensionType vectorExtensionType;
-  /* The value of the REX prefix, if present */
+  // The value of the REX prefix, if present
   uint8_t rexPrefix;
-  /* The location where a mandatory prefix would have to be (i.e., right before
-     the opcode, or right before the REX prefix if one is present) */
+  // The location where a mandatory prefix would have to be (i.e., right before
+  // the opcode, or right before the REX prefix if one is present).
   uint64_t necessaryPrefixLocation;
-  /* The segment override type */
+  // The segment override type
   SegmentOverride segmentOverride;
 
-  /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
-  BOOL xAcquireRelease;
+  // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease
+  bool xAcquireRelease;
 
-  /* Sizes of various critical pieces of data, in bytes */
+  // Sizes of various critical pieces of data, in bytes
   uint8_t registerSize;
   uint8_t addressSize;
   uint8_t displacementSize;
   uint8_t immediateSize;
 
-  /* Offsets from the start of the instruction to the pieces of data, which is
-     needed to find relocation entries for adding symbolic operands */
+  // Offsets from the start of the instruction to the pieces of data, which is
+  // needed to find relocation entries for adding symbolic operands.
   uint8_t displacementOffset;
   uint8_t immediateOffset;
 
-  /* opcode state */
+  // opcode state
 
-  /* The last byte of the opcode, not counting any ModR/M extension */
+  // The last byte of the opcode, not counting any ModR/M extension
   uint8_t opcode;
-  /* The ModR/M byte of the instruction, if it is an opcode extension */
+  // The ModR/M byte of the instruction, if it is an opcode extension
   uint8_t modRMExtension;
 
-  /* decode state */
+  // decode state
 
-  /* The type of opcode, used for indexing into the array of decode tables */
+  // The type of opcode, used for indexing into the array of decode tables
   OpcodeType opcodeType;
-  /* The instruction ID, extracted from the decode table */
+  // The instruction ID, extracted from the decode table
   uint16_t instructionID;
-  /* The specifier for the instruction, from the instruction info table */
-  const struct InstructionSpecifier *spec;
+  // The specifier for the instruction, from the instruction info table
+  const InstructionSpecifier *spec;
 
-  /* state for additional bytes, consumed during operand decode. Pattern:
-     consumed___ indicates that the byte was already consumed and does not
-     need to be consumed again */
+  // state for additional bytes, consumed during operand decode. Pattern:
+  // consumed___ indicates that the byte was already consumed and does not
+  // need to be consumed again.
 
-  /* The VEX.vvvv field, which contains a third register operand for some AVX
-     instructions */
+  // The VEX.vvvv field, which contains a third register operand for some AVX
+  // instructions.
   Reg vvvv;
 
-  /* The writemask for AVX-512 instructions which is contained in EVEX.aaa */
+  // The writemask for AVX-512 instructions which is contained in EVEX.aaa
   Reg writemask;
 
-  /* The ModR/M byte, which contains most register operands and some portion of
-     all memory operands */
-  BOOL                          consumedModRM;
+  // The ModR/M byte, which contains most register operands and some portion of
+  // all memory operands.
+  bool consumedModRM;
   uint8_t modRM;
 
-  /* The SIB byte, used for more complex 32- or 64-bit memory operands */
-  BOOL                          consumedSIB;
+  // The SIB byte, used for more complex 32- or 64-bit memory operands
+  bool consumedSIB;
   uint8_t sib;
 
-  /* The displacement, used for memory operands */
-  BOOL                          consumedDisplacement;
+  // The displacement, used for memory operands
+  bool consumedDisplacement;
   int32_t displacement;
 
-  /* Immediates.  There can be two in some cases */
+  // Immediates.  There can be two in some cases
   uint8_t numImmediatesConsumed;
   uint8_t numImmediatesTranslated;
   uint64_t immediates[2];
 
-  /* A register or immediate operand encoded into the opcode */
+  // A register or immediate operand encoded into the opcode
   Reg opcodeRegister;
 
-  /* Portions of the ModR/M byte */
+  // Portions of the ModR/M byte
 
-  /* These fields determine the allowable values for the ModR/M fields, which
-     depend on operand and address widths */
+  // These fields determine the allowable values for the ModR/M fields, which
+  // depend on operand and address widths.
   EABase eaBaseBase;
   EABase eaRegBase;
   Reg    regBase;
 
-  /* The Mod and R/M fields can encode a base for an effective address, or a
-     register.  These are separated into two fields here */
+  // The Mod and R/M fields can encode a base for an effective address, or a
+  // register.  These are separated into two fields here.
EABase eaBase; EADisplacement eaDisplacement; - /* The reg field always encodes a register */ + // The reg field always encodes a register Reg reg; - /* SIB state */ + // SIB state SIBIndex sibIndex; uint8_t sibScale; SIBBase sibBase; - const struct OperandSpecifier *operands; + ArrayRef<OperandSpecifier> operands; }; -/* decodeInstruction - Decode one instruction and store the decoding results in - * a buffer provided by the consumer. - * @param insn - The buffer to store the instruction in. Allocated by the - * consumer. - * @param reader - The byteReader_t for the bytes to be read. - * @param readerArg - An argument to pass to the reader for storing context - * specific to the consumer. May be NULL. - * @param logger - The dlog_t to be used in printing status messages from the - * disassembler. May be NULL. - * @param loggerArg - An argument to pass to the logger for storing context - * specific to the logger. May be NULL. - * @param startLoc - The address (in the reader's address space) of the first - * byte in the instruction. - * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in. - * @return - Nonzero if there was an error during decode, 0 otherwise. - */ -int decodeInstruction(struct InternalInstruction* insn, +/// \brief Decode one instruction and store the decoding results in +/// a buffer provided by the consumer. +/// \param insn The buffer to store the instruction in. Allocated by the +/// consumer. +/// \param reader The byteReader_t for the bytes to be read. +/// \param readerArg An argument to pass to the reader for storing context +/// specific to the consumer. May be NULL. +/// \param logger The dlog_t to be used in printing status messages from the +/// disassembler. May be NULL. +/// \param loggerArg An argument to pass to the logger for storing context +/// specific to the logger. May be NULL. +/// \param startLoc The address (in the reader's address space) of the first +/// byte in the instruction. +/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in. +/// \return Nonzero if there was an error during decode, 0 otherwise. +int decodeInstruction(InternalInstruction *insn, byteReader_t reader, - const void* readerArg, + const void *readerArg, dlog_t logger, - void* loggerArg, - const void* miiArg, + void *loggerArg, + const void *miiArg, uint64_t startLoc, DisassemblerMode mode); -/* x86DisassemblerDebug - C-accessible function for printing a message to - * debugs() - * @param file - The name of the file printing the debug message. - * @param line - The line number that printed the debug message. - * @param s - The message to print. - */ +/// \brief Print a message to debugs() +/// \param file The name of the file printing the debug message. +/// \param line The line number that printed the debug message. +/// \param s The message to print. 
+void Debug(const char *file, unsigned line, const char *s); -void x86DisassemblerDebug(const char *file, - unsigned line, - const char *s); +const char *GetInstrName(unsigned Opcode, const void *mii); -const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii); - -#ifdef __cplusplus -} -#endif +} // namespace X86Disassembler +} // namespace llvm #endif diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 523ae99..f59e0b6 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -1,29 +1,27 @@ -/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===* - * - * The LLVM Compiler Infrastructure - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - * - *===----------------------------------------------------------------------===* - * - * This file is part of the X86 Disassembler. - * It contains common definitions used by both the disassembler and the table - * generator. - * Documentation for the disassembler can be found in X86Disassembler.h. - * - *===----------------------------------------------------------------------===*/ - -/* - * This header file provides those definitions that need to be shared between - * the decoder and the table generator in a C-friendly manner. - */ +//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains common definitions used by both the disassembler and the table +// generator. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// #ifndef X86DISASSEMBLERDECODERCOMMON_H #define X86DISASSEMBLERDECODERCOMMON_H #include "llvm/Support/DataTypes.h" +namespace llvm { +namespace X86Disassembler { + #define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers #define CONTEXTS_SYM x86DisassemblerContexts #define ONEBYTE_SYM x86DisassemblerOneByteOpcodes @@ -44,11 +42,9 @@ #define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" #define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" -/* - * Attributes of an instruction that must be known before the opcode can be - * processed correctly. Most of these indicate the presence of particular - * prefixes, but ATTR_64BIT is simply an attribute of the decoding context. - */ +// Attributes of an instruction that must be known before the opcode can be +// processed correctly. Most of these indicate the presence of particular +// prefixes, but ATTR_64BIT is simply an attribute of the decoding context. #define ATTRIBUTE_BITS \ ENUM_ENTRY(ATTR_NONE, 0x00) \ ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \ @@ -73,13 +69,11 @@ enum attributeBits { }; #undef ENUM_ENTRY -/* - * Combinations of the above attributes that are relevant to instruction - * decode. Although other combinations are possible, they can be reduced to - * these without affecting the ultimately decoded instruction. - */ +// Combinations of the above attributes that are relevant to instruction +// decode. 
Although other combinations are possible, they can be reduced to +// these without affecting the ultimately decoded instruction. -/* Class name Rank Rationale for rank assignment */ +// Class name Rank Rationale for rank assignment #define INSTRUCTION_CONTEXTS \ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ @@ -274,17 +268,15 @@ enum attributeBits { ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, -typedef enum { +enum InstructionContext { INSTRUCTION_CONTEXTS IC_max -} InstructionContext; +}; #undef ENUM_ENTRY -/* - * Opcode types, which determine which decode table to use, both in the Intel - * manual and also for the decoder. - */ -typedef enum { +// Opcode types, which determine which decode table to use, both in the Intel +// manual and also for the decoder. +enum OpcodeType { ONEBYTE = 0, TWOBYTE = 1, THREEBYTE_38 = 2, @@ -292,39 +284,33 @@ typedef enum { XOP8_MAP = 4, XOP9_MAP = 5, XOPA_MAP = 6 -} OpcodeType; - -/* - * The following structs are used for the hierarchical decode table. After - * determining the instruction's class (i.e., which IC_* constant applies to - * it), the decoder reads the opcode. Some instructions require specific - * values of the ModR/M byte, so the ModR/M byte indexes into the final table. - * - * If a ModR/M byte is not required, "required" is left unset, and the values - * for each instructionID are identical. - */ +}; +// The following structs are used for the hierarchical decode table. After +// determining the instruction's class (i.e., which IC_* constant applies to +// it), the decoder reads the opcode. Some instructions require specific +// values of the ModR/M byte, so the ModR/M byte indexes into the final table. +// +// If a ModR/M byte is not required, "required" is left unset, and the values +// for each instructionID are identical. typedef uint16_t InstrUID; -/* - * ModRMDecisionType - describes the type of ModR/M decision, allowing the - * consumer to determine the number of entries in it. - * - * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded - * instruction is the same. - * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode - * corresponds to one instruction; otherwise, it corresponds to - * a different instruction. - * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte - * divided by 8 is used to select instruction; otherwise, each - * value of the ModR/M byte could correspond to a different - * instruction. - * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This - corresponds to instructions that use reg field as opcode - * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond - * to a different instruction. - */ - +// ModRMDecisionType - describes the type of ModR/M decision, allowing the +// consumer to determine the number of entries in it. +// +// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded +// instruction is the same. +// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode +// corresponds to one instruction; otherwise, it corresponds to +// a different instruction. +// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte +// divided by 8 is used to select instruction; otherwise, each +// value of the ModR/M byte could correspond to a different +// instruction. 
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This +// corresponds to instructions that use reg field as opcode +// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond +// to a different instruction. #define MODRMTYPES \ ENUM_ENTRY(MODRM_ONEENTRY) \ ENUM_ENTRY(MODRM_SPLITRM) \ @@ -333,47 +319,13 @@ typedef uint16_t InstrUID; ENUM_ENTRY(MODRM_FULL) #define ENUM_ENTRY(n) n, -typedef enum { +enum ModRMDecisionType { MODRMTYPES MODRM_max -} ModRMDecisionType; -#undef ENUM_ENTRY - -/* - * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which - * instruction each possible value of the ModR/M byte corresponds to. Once - * this information is known, we have narrowed down to a single instruction. - */ -struct ModRMDecision { - uint8_t modrm_type; - - /* The macro below must be defined wherever this file is included. */ - INSTRUCTION_IDS -}; - -/* - * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at - * given a particular opcode. - */ -struct OpcodeDecision { - struct ModRMDecision modRMDecisions[256]; -}; - -/* - * ContextDecision - Specifies which opcode->instruction tables to look at given - * a particular context (set of attributes). Since there are many possible - * contexts, the decoder first uses CONTEXTS_SYM to determine which context - * applies given a specific set of attributes. Hence there are only IC_max - * entries in this table, rather than 2^(ATTR_max). - */ -struct ContextDecision { - struct OpcodeDecision opcodeDecisions[IC_max]; }; +#undef ENUM_ENTRY -/* - * Physical encodings of instruction operands. - */ - +// Physical encodings of instruction operands. #define ENCODINGS \ ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ @@ -408,16 +360,13 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes") #define ENUM_ENTRY(n, d) n, - typedef enum { - ENCODINGS - ENCODING_max - } OperandEncoding; +enum OperandEncoding { + ENCODINGS + ENCODING_max +}; #undef ENUM_ENTRY -/* - * Semantic interpretations of instruction operands. - */ - +// Semantic interpretations of instruction operands. #define TYPES \ ENUM_ENTRY(TYPE_NONE, "") \ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ @@ -508,56 +457,42 @@ struct ContextDecision { ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") #define ENUM_ENTRY(n, d) n, -typedef enum { +enum OperandType { TYPES TYPE_max -} OperandType; +}; #undef ENUM_ENTRY -/* - * OperandSpecifier - The specification for how to extract and interpret one - * operand. - */ +/// \brief The specification for how to extract and interpret one operand. struct OperandSpecifier { uint8_t encoding; uint8_t type; }; -/* - * Indicates where the opcode modifier (if any) is to be found. Extended - * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. - */ - +// Indicates where the opcode modifier (if any) is to be found. Extended +// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. #define MODIFIER_TYPES \ ENUM_ENTRY(MODIFIER_NONE) #define ENUM_ENTRY(n) n, -typedef enum { +enum ModifierType { MODIFIER_TYPES MODIFIER_max -} ModifierType; +}; #undef ENUM_ENTRY -#define X86_MAX_OPERANDS 5 - -/* - * The specification for how to extract and interpret a full instruction and - * its operands. - */ -struct InstructionSpecifier { - /* The macro below must be defined wherever this file is included. 
*/ - INSTRUCTION_SPECIFIER_FIELDS -}; +static const unsigned X86_MAX_OPERANDS = 5; -/* - * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode - * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, - * respectively. - */ -typedef enum { +/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode +/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, +/// respectively. +enum DisassemblerMode { MODE_16BIT, MODE_32BIT, MODE_64BIT -} DisassemblerMode; +}; + +} // namespace X86Disassembler +} // namespace llvm #endif diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index eea0a76..b45b118 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" @@ -28,6 +27,8 @@ #include <map> using namespace llvm; +#define DEBUG_TYPE "asm-printer" + // Include the auto-generated portion of the assembly writer. #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index f34e633..531183b 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -32,6 +32,8 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index db61fb0..baf6507 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -32,7 +32,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. SmallVector<int, 8> ShuffleMask; - const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; + const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; switch (MI->getOpcode()) { case X86::INSERTPSrr: @@ -492,7 +492,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // If this was a shuffle operation, print the shuffle mask. if (!ShuffleMask.empty()) { - if (DestName == 0) DestName = Src1Name; + if (!DestName) DestName = Src1Name; OS << (DestName ? 
DestName : "mem") << " = "; // If the two sources are the same, canonicalize the input elements to be diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 1c95d37..1c8466b 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" @@ -25,6 +24,8 @@ #include <cctype> using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "X86GenAsmWriter1.inc" void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk index ee37c27..a3c9bc8 100644 --- a/lib/Target/X86/MCTargetDesc/Android.mk +++ b/lib/Target/X86/MCTargetDesc/Android.mk @@ -14,7 +14,8 @@ x86_mc_desc_SRC_FILES := \ X86MCCodeEmitter.cpp \ X86MachORelocationInfo.cpp \ X86MachObjectWriter.cpp \ - X86WinCOFFObjectWriter.cpp + X86WinCOFFObjectWriter.cpp \ + X86WinCOFFStreamer.cpp # For the host # ===================================================== diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 3f5a0e2..129c28d 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc X86MCCodeEmitter.cpp X86MachObjectWriter.cpp X86ELFObjectWriter.cpp + X86WinCOFFStreamer.cpp X86WinCOFFObjectWriter.cpp X86MachORelocationInfo.cpp X86ELFRelocationInfo.cpp diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt index 9e1d29c..146d111 100644 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Desc parent = X86 -required_libraries = MC Support X86AsmPrinter X86Info +required_libraries = MC Object Support X86AsmPrinter X86Info add_to_library_groups = X86 diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 23763f7..bf30a8e 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -37,23 +37,29 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { - default: llvm_unreachable("invalid fixup kind!"); + default: + llvm_unreachable("invalid fixup kind!"); case FK_PCRel_1: case FK_SecRel_1: - case FK_Data_1: return 0; + case FK_Data_1: + return 0; case FK_PCRel_2: case FK_SecRel_2: - case FK_Data_2: return 1; + case FK_Data_2: + return 1; case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: case X86::reloc_signed_4byte: case X86::reloc_global_offset_table: case FK_SecRel_4: - case FK_Data_4: return 2; + case FK_Data_4: + return 2; case FK_PCRel_8: case FK_SecRel_8: - case FK_Data_8: return 3; + case FK_Data_8: + case X86::reloc_global_offset_table8: + return 3; } } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 38fab15..6aeb1f2 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -643,6 +643,10 @@ namespace X86II { /// counted as one operand. 
/// inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: @@ -660,9 +664,6 @@ namespace X86II { case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). @@ -690,6 +691,8 @@ namespace X86II { unsigned FirstMemOp = 0; if (HasVEX_4V) ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). + if (HasEVEX_K) + ++FirstMemOp;// Skip the mask register return FirstMemOp; } case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index c44d88d..3fdec87 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -43,7 +43,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, bool IsPCRel) const { // determine the type of the relocation - MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant(); + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); unsigned Type; if (getEMachine() == ELF::EM_X86_64) { if (IsPCRel) { @@ -98,6 +98,12 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); + case X86::reloc_global_offset_table8: + Type = ELF::R_X86_64_GOTPC64; + break; + case X86::reloc_global_offset_table: + Type = ELF::R_X86_64_GOTPC32; + break; case FK_Data_8: switch (Modifier) { default: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 4fa519c..b679316 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -39,7 +39,7 @@ public: if (Sym->isVariable() == false) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); - const MCExpr *Expr = 0; + const MCExpr *Expr = nullptr; // If hasAddend is true, then we need to add Addend (r_addend) to Expr. bool hasAddend = false; diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index f2e34cb..09396b7 100644 --- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -23,6 +23,7 @@ enum Fixups { reloc_global_offset_table, // 32-bit, relative to the start // of the instruction. Used only // for _GLOBAL_OFFSET_TABLE_. + reloc_global_offset_table8, // 64-bit variant. 
// Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 6561804..39480ea 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -51,7 +51,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { TextAlignFillValue = 0x90; if (!is64Bit) - Data64bitsDirective = 0; // we can't emit a 64-bit unit + Data64bitsDirective = nullptr; // we can't emit a 64-bit unit // Use ## as a comment string so that .s files generated by llvm can go // through the GCC preprocessor without causing an error. This is needed @@ -115,7 +115,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // into two .words. if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) && T.getArch() == Triple::x86) - Data64bitsDirective = 0; + Data64bitsDirective = nullptr; // Always enable the integrated assembler by default. // Clang also enabled it when the OS is Solaris but that is redundant here. @@ -157,8 +157,10 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { void X86MCAsmInfoGNUCOFF::anchor() { } X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PointerSize = 8; + } AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index e6fb037..2152b21 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + namespace { class X86MCCodeEmitter : public MCCodeEmitter { X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; @@ -285,7 +286,7 @@ enum GlobalOffsetTableExprKind { }; static GlobalOffsetTableExprKind StartsWithGlobalOffsetTable(const MCExpr *Expr) { - const MCExpr *RHS = 0; + const MCExpr *RHS = nullptr; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); Expr = BE->getLHS(); @@ -316,7 +317,7 @@ void X86MCCodeEmitter:: EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { - const MCExpr *Expr = NULL; + const MCExpr *Expr = nullptr; if (DispOp.isImm()) { // If this is a simple integer displacement that doesn't require a // relocation, emit it now. 
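
[Editor's note, between hunks: the hunk below completes the new 64-bit GOT handling. EmitImmediate now picks the fixup kind from the immediate width, and the X86ELFObjectWriter hunk earlier maps each fixup to its ELF relocation type. A minimal standalone sketch of that mapping follows; the enums here are local stand-ins for the real X86::reloc_* (X86FixupKinds.h) and ELF::R_X86_64_* declarations, not LLVM's headers.]

#include <cassert>

// Stand-ins for the fixup kinds and ELF relocation types named in the
// surrounding hunks.
enum GOTFixupKind { reloc_global_offset_table, reloc_global_offset_table8 };
enum ELFRelocType { R_X86_64_GOTPC32, R_X86_64_GOTPC64 };

// EmitImmediate (next hunk): select the fixup kind by immediate width.
// 8-byte immediates take the new 64-bit variant; anything else must be 4.
GOTFixupKind gotFixupForSize(unsigned Size) {
  if (Size == 8)
    return reloc_global_offset_table8;
  assert(Size == 4 && "_GLOBAL_OFFSET_TABLE_ immediates are 4 or 8 bytes");
  return reloc_global_offset_table;
}

// GetRelocType (X86ELFObjectWriter.cpp hunk above): map each fixup to its
// ELF relocation type, GOTPC64 for the 8-byte form, GOTPC32 otherwise.
ELFRelocType elfTypeFor(GOTFixupKind Kind) {
  return Kind == reloc_global_offset_table8 ? R_X86_64_GOTPC64
                                            : R_X86_64_GOTPC32;
}

[This also matches the X86AsmBackend hunk above, which gives reloc_global_offset_table8 a log2 size of 3, i.e. 8 bytes.]
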
@@ -339,7 +340,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, if (Kind != GOT_None) { assert(ImmOffset == 0); - FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Size == 8) { + FixupKind = MCFixupKind(X86::reloc_global_offset_table8); + } else { + assert(Size == 4); + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + } + if (Kind == GOT_Normal) ImmOffset = CurByte; } else if (Expr->getKind() == MCExpr::SymbolRef) { @@ -1421,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM6r: case X86II::MRM7r: { if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); uint64_t Form = TSFlags & X86II::FormMask; EmitRegModRMByte(MI.getOperand(CurOp++), @@ -1436,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM6m: case X86II::MRM7m: { if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); uint64_t Form = TSFlags & X86II::FormMask; EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m, diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 09fdb9c..e63036c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -27,6 +27,12 @@ #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" +#if _MSC_VER +#include <intrin.h> +#endif + +using namespace llvm; + #define GET_REGINFO_MC_DESC #include "X86GenRegisterInfo.inc" @@ -36,13 +42,6 @@ #define GET_SUBTARGETINFO_MC_DESC #include "X86GenSubtargetInfo.inc" -#if _MSC_VER -#include <intrin.h> -#endif - -using namespace llvm; - - std::string X86_MC::ParseX86Triple(StringRef TT) { Triple TheTriple(TT); std::string FS; @@ -230,14 +229,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, } std::string CPUName = CPU; - if (CPUName.empty()) { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - CPUName = sys::getHostCPUName(); -#else + if (CPUName.empty()) CPUName = "generic"; -#endif - } MCSubtargetInfo *X = new MCSubtargetInfo(); InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS); @@ -294,13 +287,13 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { // Initial state of the frame pointer is esp+stackGrowth. unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( - 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); MAI->addInitialFrameState(Inst); // Add return address to move list unsigned InstPtr = is64Bit ? 
X86::RIP : X86::EIP; MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( - 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); MAI->addInitialFrameState(Inst2); return MAI; @@ -365,13 +358,16 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, bool NoExecStack) { Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) + switch (TheTriple.getObjectFormat()) { + default: llvm_unreachable("unsupported object format"); + case Triple::MachO: return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - - if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) - return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); - - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + case Triple::COFF: + assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); + return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); + case Triple::ELF: + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + } } static MCInstPrinter *createX86MCInstPrinter(const Target &T, @@ -384,7 +380,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, return new X86ATTInstPrinter(MAI, MII, MRI); if (SyntaxVariant == 1) return new X86IntelInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT, diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 41ae435..8fe40fd 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -26,6 +26,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCRelocationInfo; +class MCStreamer; class Target; class StringRef; class raw_ostream; @@ -84,6 +85,14 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); +/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code +/// streamer which will generate PE/COFF format object files. +/// +/// Takes ownership of \p AB and \p CE. +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + MCCodeEmitter *CE, raw_ostream &OS, + bool RelaxAll); + /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, bool Is64Bit, diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index f2023e3..3b81d53 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -40,7 +40,7 @@ public: // FIXME: check that the value is actually the same. 
if (Sym->isVariable() == false) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); - const MCExpr *Expr = 0; + const MCExpr *Expr = nullptr; switch(RelType) { case X86_64_RELOC_TLV: diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 1a35ced..ead3338 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -146,13 +146,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, const MCSymbol *A = &Target.getSymA()->getSymbol(); if (A->isTemporary()) A = &A->AliasedSymbol(); - MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData &A_SD = Asm.getSymbolData(*A); const MCSymbolData *A_Base = Asm.getAtom(&A_SD); const MCSymbol *B = &Target.getSymB()->getSymbol(); if (B->isTemporary()) B = &B->AliasedSymbol(); - MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); const MCSymbolData *B_Base = Asm.getAtom(&B_SD); // Neither symbol can be modified. @@ -186,9 +186,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, false); Value += Writer->getSymbolAddress(&A_SD, Layout) - - (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout)); + (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout)); Value -= Writer->getSymbolAddress(&B_SD, Layout) - - (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout)); + (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout)); if (A_Base) { Index = A_Base->getIndex(); @@ -220,7 +220,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); const MCSymbolData *Base = Asm.getAtom(&SD); // Relocations inside debug sections always use local relocations when @@ -231,7 +231,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, const MCSectionMachO &Section = static_cast<const MCSectionMachO&>( Fragment->getParent()->getSection()); if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = 0; + Base = nullptr; } // x86_64 almost always uses external relocations, except when there is no @@ -369,7 +369,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - MCSymbolData *A_SD = &Asm.getSymbolData(*A); + const MCSymbolData *A_SD = &Asm.getSymbolData(*A); if (!A_SD->getFragment()) report_fatal_error("symbol '" + A->getName() + @@ -382,7 +382,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { - MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); if (!B_SD->getFragment()) report_fatal_error("symbol '" + B->getSymbol().getName() + @@ -465,7 +465,7 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, unsigned IsPCRel = 0; // Get the symbol data. 
- MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); unsigned Index = SD_A->getIndex(); // We're only going to have a second symbol in pic mode and it'll be a @@ -476,7 +476,8 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // If this is a subtraction then we're pcrel. uint32_t FixupAddress = Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); - MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol()); + const MCSymbolData *SD_B = + &Asm.getSymbolData(Target.getSymB()->getSymbol()); IsPCRel = 1; FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) + Target.getConstant()); @@ -524,7 +525,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, } // Get the symbol data, if any. - MCSymbolData *SD = 0; + const MCSymbolData *SD = nullptr; if (Target.getSymA()) SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index ffc9e8d..40af822 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -23,10 +23,8 @@ namespace llvm { namespace { class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { - const bool Is64Bit; - public: - X86WinCOFFObjectWriter(bool Is64Bit_); + X86WinCOFFObjectWriter(bool Is64Bit); virtual ~X86WinCOFFObjectWriter(); unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, @@ -34,10 +32,9 @@ namespace { }; } -X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_) - : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 : - COFF::IMAGE_FILE_MACHINE_I386), - Is64Bit(Is64Bit_) {} +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 + : COFF::IMAGE_FILE_MACHINE_I386) {} X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} @@ -49,29 +46,46 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); - switch (FixupKind) { - case FK_PCRel_4: - case X86::reloc_riprel_4byte: - case X86::reloc_riprel_4byte_movq_load: - return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32; - case FK_Data_4: - case X86::reloc_signed_4byte: - if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) - return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB : - COFF::IMAGE_REL_I386_DIR32NB; - return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32; - case FK_Data_8: - if (Is64Bit) + if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_AMD64_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_AMD64_ADDR32NB; + return COFF::IMAGE_REL_AMD64_ADDR32; + case FK_Data_8: return COFF::IMAGE_REL_AMD64_ADDR64; - llvm_unreachable("unsupported relocation type"); - case FK_SecRel_2: - return Is64Bit ? COFF::IMAGE_REL_AMD64_SECTION - : COFF::IMAGE_REL_I386_SECTION; - case FK_SecRel_4: - return Is64Bit ? 
COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL; - default: - llvm_unreachable("unsupported relocation type"); - } + case FK_SecRel_2: + return COFF::IMAGE_REL_AMD64_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_AMD64_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_I386_DIR32NB; + return COFF::IMAGE_REL_I386_DIR32; + case FK_SecRel_2: + return COFF::IMAGE_REL_I386_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_I386_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else + llvm_unreachable("Unsupported COFF machine type."); } MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS, diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp new file mode 100644 index 0000000..c62fd0a --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -0,0 +1,51 @@ +//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +using namespace llvm; + +namespace { +class X86WinCOFFStreamer : public MCWinCOFFStreamer { +public: + X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, + raw_ostream &OS) + : MCWinCOFFStreamer(C, AB, *CE, OS) { } + + void EmitWin64EHHandlerData() override; + void FinishImpl() override; +}; + +void X86WinCOFFStreamer::EmitWin64EHHandlerData() { + MCStreamer::EmitWin64EHHandlerData(); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo()); +} + +void X86WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + EmitW64Tables(); + + MCWinCOFFStreamer::FinishImpl(); +} +} + +namespace llvm { +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + MCCodeEmitter *CE, raw_ostream &OS, + bool RelaxAll) { + X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); + S->getAssembler().setRelaxAll(RelaxAll); + return S; +} +} + diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 18e6845..64e8ea8 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -30,9 +30,9 @@ class X86TargetMachine; FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createGlobalBaseRegPass - This pass initializes a global base +/// createX86GlobalBaseRegPass - This pass initializes a global base /// register for PIC on x86-32. 
-FunctionPass* createGlobalBaseRegPass(); +FunctionPass* createX86GlobalBaseRegPass(); /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses /// to local-dynamic TLS variables so that the TLS base address for the module diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 78edcf0..6912b57 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -166,6 +166,8 @@ def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "Call register indirect">; def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -195,8 +197,7 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, [FeatureSSE3, FeatureSlowBTMem]>; @@ -227,6 +228,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, FeaturePCLMUL, FeatureAES, FeatureCallRegIndirect, FeaturePRFCHW, + FeatureSlowLEA, FeatureSlowBTMem, FeatureFastUAMem]>; // "Arrandale" along with corei3 and corei5 def : ProcessorModel<"corei7", SandyBridgeModel, @@ -329,6 +331,13 @@ def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeaturePOPCNT, FeatureBMI, FeatureTBM, FeatureFMA, FeatureFSGSBase]>; +// Excavator +def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, + FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, + FeaturePCLMUL, FeatureF16C, FeatureLZCNT, + FeaturePOPCNT, FeatureBMI, FeatureBMI2, + FeatureTBM, FeatureFMA, FeatureFSGSBase]>; + def : Proc<"geode", [Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; @@ -336,6 +345,20 @@ def : Proc<"winchip2", [Feature3DNow]>; def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; +// We also provide a generic 64-bit specific x86 processor model which tries to +// be good for modern chips without enabling instruction set encodings past the +// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and +// modern 64-bit x86 chip, and enables features that are generally beneficial. +// +// We currently use the Sandy Bridge model as the default scheduling model as +// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which +// covers a huge swath of x86 processors. If there are specific scheduling +// knobs which need to be tuned differently for AMD chips, we might consider +// forming a common base for them. 
+def : ProcessorModel<"x86-64", SandyBridgeModel, + [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index fb66acc..1dca568 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -15,7 +15,6 @@ #include "X86AsmPrinter.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" -#include "X86COFFMachineModuleInfo.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" @@ -102,7 +101,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ @@ -110,14 +109,14 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub"); MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } @@ -174,7 +173,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0, unsigned AsmVariant = 0); + const char *Modifier = nullptr, unsigned AsmVariant = 0); /// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value. 
These print slightly differently, for @@ -232,7 +231,7 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL) { + const char *Modifier = nullptr) { const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); @@ -284,7 +283,7 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL) { + const char *Modifier = nullptr) { assert(isMem(MI, Op) && "Invalid memory reference!"); const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); if (Segment.getReg()) { @@ -296,7 +295,7 @@ static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL, + const char *Modifier = nullptr, unsigned AsmVariant = 1) { const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); @@ -464,7 +463,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(*this, MI, OpNo, O, /*Modifier*/ 0, AsmVariant); + printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); return false; } @@ -527,6 +526,55 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { } } +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. 
+ OutStreamer.EmitValue( + MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + +void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { + SmallString<128> Directive; + raw_svector_ostream OS(Directive); + StringRef Name = Sym->getName(); + + if (Subtarget->isTargetKnownWindowsMSVC()) + OS << " /EXPORT:"; + else + OS << " -export:"; + + if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) && + (Name[0] == getDataLayout().getGlobalPrefix())) + Name = Name.drop_front(); + + OS << Name; + + if (IsData) { + if (Subtarget->isTargetKnownWindowsMSVC()) + OS << ",DATA"; + else + OS << ",data"; + } + + OS.flush(); + OutStreamer.EmitBytes(Directive); +} void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { if (Subtarget->isTargetMacho()) { @@ -547,11 +595,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { 5, SectionKind::getMetadata()); OutStreamer.SwitchSection(TheSection); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + for (const auto &Stub : Stubs) { // L_foo$stub: - OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitLabel(Stub.first); // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(), + OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(), MCSA_IndirectSymbol); // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; @@ -571,44 +619,24 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { SectionKind::getMetadata()); OutStreamer.SwitchSection(TheSection); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), - MCSA_IndirectSymbol); - // .long 0 - if (MCSym.getInt()) - // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/); - else - // Internal to current translation unit. - // - // When we place the LSDA into the TEXT section, the type info - // pointers need to be indirect and pc-rel. We accomplish this by - // using NLPs. However, sometimes the types are local to the file. So - // we need to fill in the value for the NLP in those cases. 
- OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), - OutContext), 4/*size*/); - } + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + Stubs.clear(); OutStreamer.AddBlankLine(); } Stubs = MMIMacho.GetHiddenGVStubList(); if (!Stubs.empty()) { - OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); - EmitAlignment(2); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .long _foo - OutStreamer.EmitValue(MCSymbolRefExpr:: - Create(Stubs[i].second.getPointer(), - OutContext), 4/*size*/); - } + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__pointers", + MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + Stubs.clear(); OutStreamer.AddBlankLine(); } @@ -630,46 +658,25 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } if (Subtarget->isTargetCOFF()) { - X86COFFMachineModuleInfo &COFFMMI = - MMI->getObjFileInfo<X86COFFMachineModuleInfo>(); - - // Emit type information for external functions - typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator; - for (externals_iterator I = COFFMMI.externals_begin(), - E = COFFMMI.externals_end(); - I != E; ++I) { - OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer.EndCOFFSymbolDef(); - } - // Necessary for dllexport support std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasDLLExportStorageClass()) - DLLExportedFns.push_back(getSymbol(I)); + for (const auto &Function : M) + if (Function.hasDLLExportStorageClass()) + DLLExportedFns.push_back(getSymbol(&Function)); - for (Module::const_global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) - if (I->hasDLLExportStorageClass()) - DLLExportedGlobals.push_back(getSymbol(I)); + for (const auto &Global : M.globals()) + if (Global.hasDLLExportStorageClass()) + DLLExportedGlobals.push_back(getSymbol(&Global)); - for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - const GlobalValue *GV = I; - if (!GV->hasDLLExportStorageClass()) + for (const auto &Alias : M.aliases()) { + if (!Alias.hasDLLExportStorageClass()) continue; - while (const GlobalAlias *A = dyn_cast<GlobalAlias>(GV)) - GV = A->getAliasedGlobal(); - - if (isa<Function>(GV)) - DLLExportedFns.push_back(getSymbol(I)); - else if (isa<GlobalVariable>(GV)) - DLLExportedGlobals.push_back(getSymbol(I)); + if (Alias.getType()->getElementType()->isFunctionTy()) + DLLExportedFns.push_back(getSymbol(&Alias)); + else + DLLExportedGlobals.push_back(getSymbol(&Alias)); } // Output linker support code for dllexported globals on windows. 
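
[Editor's note, between hunks: the hunk below replaces the hand-rolled per-symbol export strings with calls to the GenerateExportDirective helper added earlier in this file's diff. As a rough standalone sketch of the directive string that helper writes into the .drectve section — the function name and the IsMSVC flag here are illustrative, collapsing the isTargetKnownWindowsMSVC vs. Windows GNU/Cygwin checks from the added code:]

#include <string>

// Mirrors GenerateExportDirective from the hunk above: MSVC-style linkers
// take " /EXPORT:name[,DATA]", GNU-style ones take " -export:name[,data]",
// and GNU/Cygwin targets first drop the global symbol prefix (e.g. '_' on
// i386) from the name.
std::string exportDirective(std::string Name, bool IsData, bool IsMSVC,
                            char GlobalPrefix = '_') {
  std::string Directive = IsMSVC ? " /EXPORT:" : " -export:";
  if (!IsMSVC && !Name.empty() && Name[0] == GlobalPrefix)
    Name.erase(0, 1);
  Directive += Name;
  if (IsData)
    Directive += IsMSVC ? ",DATA" : ",data";
  return Directive;
}

// e.g. exportDirective("_foo", /*IsData=*/true, /*IsMSVC=*/false)
//        yields " -export:foo,data"
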
@@ -678,28 +685,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); - SmallString<128> name; - for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) { - if (Subtarget->isTargetKnownWindowsMSVC()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedGlobals[i]->getName(); - if (Subtarget->isTargetKnownWindowsMSVC()) - name += ",DATA"; - else - name += ",data"; - OutStreamer.EmitBytes(name); - } - for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { - if (Subtarget->isTargetKnownWindowsMSVC()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedFns[i]->getName(); - OutStreamer.EmitBytes(name); - } + for (auto & Symbol : DLLExportedGlobals) + GenerateExportDirective(Symbol, /*IsData=*/true); + for (auto & Symbol : DLLExportedFns) + GenerateExportDirective(Symbol, /*IsData=*/false); } } @@ -715,9 +705,9 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); const DataLayout *TD = TM.getDataLayout(); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + for (const auto &Stub : Stubs) { + OutStreamer.EmitLabel(Stub.first); + OutStreamer.EmitSymbolValue(Stub.second.getPointer(), TD->getPointerSize()); } Stubs.clear(); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 3308cc2..e4eef5d 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -16,13 +16,15 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MCStreamer; +class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; StackMaps SM; + void GenerateExportDirective(const MCSymbol *Sym, bool IsData); + public: explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer), SM(*this) { diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp deleted file mode 100644 index 6a6125b..0000000 --- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is an MMI implementation for X86 COFF (windows) targets. -// -//===----------------------------------------------------------------------===// - -#include "X86COFFMachineModuleInfo.h" -using namespace llvm; - - -X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() { -} diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h deleted file mode 100644 index 0dfeb42..0000000 --- a/lib/Target/X86/X86COFFMachineModuleInfo.h +++ /dev/null @@ -1,46 +0,0 @@ -//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is an MMI implementation for X86 COFF (windows) targets. 
-// -//===----------------------------------------------------------------------===// - -#ifndef X86COFF_MACHINEMODULEINFO_H -#define X86COFF_MACHINEMODULEINFO_H - -#include "X86MachineFunctionInfo.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/CodeGen/MachineModuleInfo.h" - -namespace llvm { - class X86MachineFunctionInfo; - class DataLayout; - -/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation -/// for X86 COFF targets. -class X86COFFMachineModuleInfo : public MachineModuleInfoImpl { - DenseSet<MCSymbol const *> Externals; -public: - X86COFFMachineModuleInfo(const MachineModuleInfo &) {} - virtual ~X86COFFMachineModuleInfo(); - - void addExternalFunction(MCSymbol* Symbol) { - Externals.insert(Symbol); - } - - typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator; - externals_iterator externals_begin() const { return Externals.begin(); } - externals_iterator externals_end() const { return Externals.end(); } -}; - - - -} // end namespace llvm - -#endif diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index 040da35..e76f9fd 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -29,33 +29,6 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } -inline bool CC_X86_CDeclMethod_SRet(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { - // Swap the order of the first two parameters if the first parameter is sret. - if (ArgFlags.isSRet()) { - assert(ValNo == 0); - assert(ValVT == MVT::i32); - State.AllocateStack(8, 4); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 4, LocVT, LocInfo)); - - // Indicate that we need to swap the order of the first and second - // parameters by "allocating" register zero. There are no register - // parameters with cdecl methods, so we can use this to communicate to the - // next call. - State.AllocateReg(1); - return true; - } else if (ValNo == 1 && State.isAllocated(1)) { - assert(ValVT == MVT::i32 && "non-i32-sized this param unsupported"); - // Stack was already allocated while processing sret. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 0, LocVT, LocInfo)); - return true; - } - - // All other args use the C calling convention. - return false; -} - } // End llvm namespace #endif diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 1cfd827..0824d4e 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -485,15 +485,6 @@ def CC_X86_32_ThisCall_Win : CallingConv<[ CCDelegateTo<CC_X86_32_ThisCall_Common> ]>; -def CC_X86_CDeclMethod : CallingConv<[ - // Promote i8/i16 arguments to i32. 
- CCIfType<[i8, i16], CCPromoteToType<i32>>, - - CCCustom<"CC_X86_CDeclMethod_SRet">, - - CCDelegateTo<CC_X86_32_Common> -]>; - def CC_X86_32_ThisCall : CallingConv<[ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>, CCDelegateTo<CC_X86_32_ThisCall_Win> @@ -583,7 +574,6 @@ def CC_Intel_OCL_BI : CallingConv<[ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, - CCIfCC<"CallingConv::X86_CDeclMethod", CCDelegateTo<CC_X86_CDeclMethod>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index f6c4c2e..76718d0 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-emitter" #include "X86.h" #include "X86InstrInfo.h" #include "X86JITInfo.h" @@ -36,6 +35,8 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "x86-emitter" + STATISTIC(NumEmitted, "Number of machine instructions emitted"); namespace { @@ -52,7 +53,7 @@ namespace { public: static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -450,7 +451,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, intptr_t PCAdj) { const MachineOperand &Op3 = MI.getOperand(Op+3); int DispVal = 0; - const MachineOperand *DispForReloc = 0; + const MachineOperand *DispForReloc = nullptr; // Figure out what sort of displacement we have to handle here. if (Op3.isGlobal()) { @@ -1475,7 +1476,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } MCE.processDebugLoc(MI.getDebugLoc(), false); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 1aab1ea..56bcfa3 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -183,7 +183,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: @@ -363,7 +363,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // it works...). if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) if (const GlobalVariable *GVar = - dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal())) + dyn_cast_or_null<GlobalVariable>(GA->getAliasee())) if (GVar->isThreadLocal()) return false; @@ -406,7 +406,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { } else { // Issue load from stub. 
unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; X86AddressMode StubAM; StubAM.Base.Reg = AM.Base.Reg; StubAM.GV = GV; @@ -441,7 +441,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Now construct the final address. Note that the Disp, Scale, // and Index values may already be set here. AM.Base.Reg = LoadReg; - AM.GV = 0; + AM.GV = nullptr; return true; } } @@ -467,7 +467,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { SmallVector<const Value *, 32> GEPs; redo_gep: - const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(V)) { // Don't walk into other basic blocks; it's possible we haven't @@ -626,7 +626,7 @@ redo_gep: /// X86SelectCallAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { - const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; const Instruction *I = dyn_cast<Instruction>(V); // Record if the value is defined in the same basic block. @@ -1247,7 +1247,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; @@ -1487,7 +1487,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { if (!Subtarget->hasCMov()) return false; unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (VT == MVT::i16) { Opc = X86::CMOVE16rr; RC = &X86::GR16RegClass; @@ -1821,10 +1821,10 @@ bool X86FastISel::FastLowerArguments() { } } - static const uint16_t GPR32ArgRegs[] = { + static const MCPhysReg GPR32ArgRegs[] = { X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D }; - static const uint16_t GPR64ArgRegs[] = { + static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; @@ -1865,7 +1865,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (cast<CallInst>(I)->isTailCall()) return false; - return DoSelectCall(I, 0); + return DoSelectCall(I, nullptr); } static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget, @@ -1936,8 +1936,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86SelectCallAddress(Callee, CalleeAM)) return false; unsigned CalleeOp = 0; - const GlobalValue *GV = 0; - if (CalleeAM.GV != 0) { + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { GV = CalleeAM.GV; } else if (CalleeAM.Base.Reg != 0) { CalleeOp = CalleeAM.Base.Reg; @@ -2163,7 +2163,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->is64Bit() && isVarArg && !isWin64) { // Count the number of XMM registers allocated. - static const uint16_t XMMArgRegs[] = { + static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2387,7 +2387,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // Get opcode and regclass of the output for the given load instruction. 
unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::i8: @@ -2437,7 +2437,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // If the expression is just a basereg, then we're done, otherwise we need // to emit an LEA. if (AM.BaseType == X86AddressMode::RegBase && - AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; @@ -2510,7 +2510,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { // Get opcode and regclass for the given zero. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: @@ -2558,7 +2558,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (Result == 0) return false; + if (!Result) return false; FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index c2c234b..6c5b86f 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-fixup-LEAs" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "x86-fixup-LEAs" + STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { @@ -56,6 +57,11 @@ namespace { void processInstruction(MachineBasicBlock::iterator& I, MachineFunction::iterator MFI); + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand& p, @@ -85,7 +91,7 @@ namespace { private: MachineFunction *MF; const TargetMachine *TM; - const TargetInstrInfo *TII; // Machine instruction info. + const X86InstrInfo *TII; // Machine instruction info. }; char FixupLEAPass::ID = 0; @@ -97,7 +103,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineInstr* MI = MBBI; MachineInstr* NewMI; switch (MI->getOpcode()) { - case X86::MOV32rr: + case X86::MOV32rr: case X86::MOV64rr: { const MachineOperand& Src = MI->getOperand(1); const MachineOperand& Dest = MI->getOperand(0); @@ -123,7 +129,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, if (!MI->getOperand(2).isImm()) { // convertToThreeAddress will call getImm() // which requires isImm() to be true - return 0; + return nullptr; } break; case X86::ADD16rr: @@ -132,10 +138,10 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, // if src1 != src2, then convertToThreeAddress will // need to create a Virtual register, which we cannot do // after register allocation. 
- return 0; + return nullptr; } } - return TII->convertToThreeAddress(MFI, MBBI, 0); + return TII->convertToThreeAddress(MFI, MBBI, nullptr); } FunctionPass *llvm::createX86FixupLEAs() { @@ -143,9 +149,12 @@ FunctionPass *llvm::createX86FixupLEAs() { } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { - MF = &Func; - TM = &MF->getTarget(); - TII = TM->getInstrInfo(); + TM = &Func.getTarget(); + const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>(); + if (!ST.LEAusesAG() && !ST.slowLEA()) + return false; + + TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -211,7 +220,7 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } - return 0; + return nullptr; } void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, @@ -242,9 +251,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); if (NewMI) { ++NumLEAs; - DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); // now to replace with an equivalent LEA... - DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); MFI->erase(MBI); MachineBasicBlock::iterator J = static_cast<MachineBasicBlock::iterator> (NewMI); @@ -253,10 +262,80 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, } } +void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineInstr *MI = I; + const int opcode = MI->getOpcode(); + if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && + opcode != X86::LEA64_32r) + return; + if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || + !TII->isSafeToClobberEFLAGS(*MFI, I)) + return; + const unsigned DstR = MI->getOperand(0).getReg(); + const unsigned SrcR1 = MI->getOperand(1).getReg(); + const unsigned SrcR2 = MI->getOperand(3).getReg(); + if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) + return; + if (MI->getOperand(2).getImm() > 1) + return; + int addrr_opcode, addri_opcode; + switch (opcode) { + case X86::LEA16r: + addrr_opcode = X86::ADD16rr; + addri_opcode = X86::ADD16ri; + break; + case X86::LEA32r: + addrr_opcode = X86::ADD32rr; + addri_opcode = X86::ADD32ri; + break; + case X86::LEA64_32r: + case X86::LEA64r: + addrr_opcode = X86::ADD64rr; + addri_opcode = X86::ADD64ri32; + break; + default: + assert(false && "Unexpected LEA instruction"); + } + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = 0; + const MachineOperand &Dst = MI->getOperand(0); + // Make ADD instruction for two registers writing to LEA's destination + if (SrcR1 != 0 && SrcR2 != 0) { + const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3); + const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode)) + .addOperand(Dst) + .addOperand(Src1) + .addOperand(Src2); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + // Make ADD instruction for immediate + if (MI->getOperand(4).getImm() != 0) { + const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 
1 : 3); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode)) + .addOperand(Dst) + .addOperand(SrcR) + .addImm(MI->getOperand(4).getImm()); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + if (NewMI) { + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + } +} + bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { - for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) - processInstruction(I, MFI); + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { + if (TM->getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } return false; } diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 7955ade..c8a3ab3 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -23,7 +23,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-codegen" #include "X86.h" #include "X86InstrInfo.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -45,6 +44,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "x86-codegen" + STATISTIC(NumFXCH, "Number of fxch instructions inserted"); STATISTIC(NumFP , "Number of floating point instructions"); @@ -430,7 +431,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! - MachineInstr *PrevMI = 0; + MachineInstr *PrevMI = nullptr; if (I != BB.begin()) PrevMI = std::prev(I); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index f0ad4d1..4c1374f 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -182,7 +182,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } - MachineInstr *MI = NULL; + MachineInstr *MI = nullptr; if (UseLEA) { MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), @@ -204,7 +204,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, /// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. static void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = NULL) { + unsigned StackPtr, uint64_t *NumBytes = nullptr) { if (MBBI == MBB.begin()) return; MachineBasicBlock::iterator PI = std::prev(MBBI); @@ -225,11 +225,12 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator. +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower +/// iterator. static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = NULL) { + unsigned StackPtr, uint64_t *NumBytes = nullptr) { // FIXME: THIS ISN'T RUN!!! return; @@ -257,19 +258,19 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB, } /// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the -/// stack adjustment is returned as a positive value for ADD/LEA and a negative for -/// SUB. +/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and +/// the stack adjustment is returned as a positive value for ADD/LEA and a +/// negative for SUB. 
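// Aside on the FixupLEAs change above: on Silvermont a three-operand LEA is
// slower than a plain ADD, so processInstructionForSLM rewrites an LEA whose
// destination aliases one of its sources (scale 1, immediate displacement,
// no segment, EFLAGS free to clobber) into one or two ADDs. A standalone
// sketch of that decision, with a simplified Lea struct standing in for
// MachineInstr (illustration only, not the pass's real types):
#include <cstdio>

struct Lea { unsigned Dst, Src1, Src2; int Scale; long Imm; }; // reg 0 = none

bool rewriteAsAdds(const Lea &L, bool FlagsDead) {
  if (!FlagsDead || L.Scale > 1)
    return false;                              // ADD writes EFLAGS; LEA does not
  if ((L.Src1 == 0 || L.Src1 != L.Dst) && (L.Src2 == 0 || L.Src2 != L.Dst))
    return false;                              // destination must alias a source
  if (L.Src1 != 0 && L.Src2 != 0)              // two-register part
    std::printf("add r%u, r%u\n", L.Dst, L.Src1 == L.Dst ? L.Src2 : L.Src1);
  if (L.Imm != 0)                              // immediate part
    std::printf("add r%u, %ld\n", L.Dst, L.Imm);
  return true;
}

int main() {
  rewriteAsAdds({1, 1, 2, 1, 8}, true);        // lea r1,[r1+r2+8] -> add r1,r2; add r1,8
}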
static int mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, - bool doMergeWithPrevious) { + MachineBasicBlock::iterator &MBBI, unsigned StackPtr, + bool doMergeWithPrevious) { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; - MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : std::next(MBBI); + MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr + : std::next(MBBI); unsigned Opc = PI->getOpcode(); int Offset = 0; @@ -366,8 +367,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = - MMI.addFrameInst(MCCFIInstruction::createOffset(0, DwarfReg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, + Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } } @@ -446,7 +449,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { !MFI->adjustsStack() && // No calls. !IsWin64 && // Win64 has no Red Zone !usesTheStack(MF) && // Don't push and pop. - !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -511,15 +514,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA rule to use the provided offset. assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(0, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(0, DwarfFramePtr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createOffset(nullptr, + DwarfFramePtr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -534,8 +538,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaRegister(0, DwarfFramePtr)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -564,7 +568,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } @@ -698,9 +702,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA rule to use the provided offset. 
assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(0, -StackSize + stackGrowth)); + MCCFIInstruction::createDefCfaOffset(nullptr, + -StackSize + stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -905,7 +910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1170,6 +1176,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) report_fatal_error("Segmented stacks not supported on this platform."); + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1194,11 +1209,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.push_front(allocMBB); MF.push_front(checkMBB); - // Eventually StackSize will be calculated by a link-time pass; which will - // also decide whether checking code needs to be injected into this particular - // prologue. - StackSize = MFI->getStackSize(); - // When the frame size is less than 256 we just compare the stack // boundary directly to the value of the stack pointer, per gcc. bool CompareStackPointer = StackSize < kSplitStackAvailable; @@ -1256,22 +1266,23 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { - // TlsOffset doesn't fit into a mod r/m byte so we need an extra register + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. unsigned ScratchReg2; bool SaveScratch2; if (CompareStackPointer) { - // The primary scratch register is available for holding the TLS offset + // The primary scratch register is available for holding the TLS offset. ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); - // Unfortunately, with fastcc the second scratch register may hold an arg + // Unfortunately, with fastcc the second scratch register may hold an + // argument. SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); } - // If Scratch2 is live-in then it needs to be saved + // If Scratch2 is live-in then it needs to be saved. 
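// The check adjustForSegmentedStacks assembles here is the classic
// split-stack probe: compare the would-be stack pointer against a per-thread
// limit kept in TLS (the TlsReg:TlsOffset pair above) and call the runtime
// to grow the stack on failure; per the comment above, frames under 256
// bytes compare the stack pointer directly, as gcc does. In pseudo-C++
// (hypothetical names; the real code emits machine instructions):
#include <cstdint>
#include <cstdio>

thread_local uintptr_t StackLimit = 0x1000;    // stands in for the TLS slot

extern "C" void morestack_stub() { std::puts("grow the stack, then retry"); }

void splitStackCheck(uintptr_t SP, uint64_t StackSize) {
  bool CompareStackPointer = StackSize < 256;  // kSplitStackAvailable
  uintptr_t Probe = CompareStackPointer ? SP : SP - StackSize;
  if (Probe < StackLimit)
    morestack_stub();                          // the allocMBB path
}

int main() { splitStackCheck(0x1100, 512); }   // 0x1100 - 512 < 0x1000 -> grow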
assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && "Scratch register is live-in and not saved"); @@ -1348,14 +1359,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) /// /// CheckStack: -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart /// OldStart: -/// ... +/// ... /// IncStack: -/// call inc_stack # doubles the stack space -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { const X86InstrInfo &TII = *TM.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1514,7 +1525,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - MachineInstr *New = 0; + MachineInstr *New = nullptr; if (Opcode == TII.getCallFrameSetupOpcode()) { New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index f0db8cb..208bb8b 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -47,7 +47,7 @@ public: void adjustForHiPEPrologue(MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 3e45adb..74386d3 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-isel" #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" @@ -36,6 +35,8 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "x86-isel" + STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// @@ -70,17 +71,18 @@ namespace { X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), - Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0), - SymbolFlags(X86II::MO_NO_FLAG) { + Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), + JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) { } bool hasSymbolicDisplacement() const { - return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0; + return GV != nullptr || CP != nullptr || ES != nullptr || + JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { return BaseType == FrameIndexBase || - IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; + IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } /// isRIPRelative - Return true if this addressing mode is already RIP @@ -102,14 +104,14 @@ namespace { void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; - if (Base_Reg.getNode() != 0) + if 
(Base_Reg.getNode()) Base_Reg.getNode()->dump(); else dbgs() << "nul"; dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' << " Scale" << Scale << '\n' << "IndexReg "; - if (IndexReg.getNode() != 0) + if (IndexReg.getNode()) IndexReg.getNode()->dump(); else dbgs() << "nul"; @@ -160,6 +162,13 @@ namespace { return "X86 DAG->DAG Instruction Selection"; } + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &TM.getSubtarget<X86Subtarget>(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + void EmitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; @@ -374,14 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), - MVT::Other, &Ops[0], Ops.size()); + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); Ops.clear(); Ops.push_back(NewChain); } for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) Ops.push_back(OrigChain.getOperand(i)); - CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); @@ -390,7 +398,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, Ops.push_back(SDValue(Load.getNode(), 1)); for (unsigned i = 1, e = NumOps; i != e; ++i) Ops.push_back(Call.getOperand(i)); - CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps); + CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } /// isCalleeLoad - Return true if call address is a load and it can be @@ -612,7 +620,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ // gs:0 (or fs:0 on X86-64) contains its own address. // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) - if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 && + if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && Subtarget->isTargetLinux()) switch (N->getPointerInfo().getAddrSpace()) { case 256: @@ -733,7 +741,7 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { // a smaller encoding and avoids a scaled-index. 
if (AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0) { + AM.Base_Reg.getNode() == nullptr) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } @@ -745,8 +753,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { Subtarget->is64Bit() && AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && - AM.IndexReg.getNode() == 0 && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); @@ -926,7 +934,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, APInt MaskedHighBits = APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(X, KnownZero, KnownOne); + DAG.computeKnownBits(X, KnownZero, KnownOne); if (MaskedHighBits != KnownZero) return true; // We've identified a pattern that can be transformed into a single shift @@ -1009,7 +1017,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && + AM.Base_Reg.getNode() == nullptr && (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); @@ -1018,7 +1026,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, break; case ISD::SHL: - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (ConstantSDNode @@ -1052,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::SRL: { // Scale must not be used already. - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; @@ -1086,8 +1094,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && - AM.IndexReg.getNode() == 0) { + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || @@ -1237,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // with a constant to enable use of the scaled offset field. // Scale must not be used already. - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue Shift = N.getOperand(0); if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; @@ -1276,7 +1284,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. 
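// The "X*[3,5,9] -> X+X*[2,4,8]" case above is what lets the matcher fold a
// small multiply into one LEA: the x86 addressing mode computes
// base + index*scale + disp with scale restricted to {1,2,4,8}, so x*9
// becomes lea (%rax,%rax,8). A scalar model of that decomposition (sketch
// only, mirroring the CN->getZExtValue() == 3 || 5 || 9 check above):
#include <cassert>
#include <cstdint>

bool decomposeMul(uint64_t C, unsigned &Scale) {
  if (C != 3 && C != 5 && C != 9)
    return false;
  Scale = (unsigned)(C - 1);                    // x*C == x + x*(C-1)
  return true;
}

int main() {
  unsigned Scale;
  assert(decomposeMul(9, Scale) && Scale == 8); // x*9 -> x + x*8
  assert(!decomposeMul(7, Scale));              // 7 has no single-LEA form
}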
- if (AM.IndexReg.getNode() == 0) { + if (!AM.IndexReg.getNode()) { AM.IndexReg = N; AM.Scale = 1; return false; @@ -1567,7 +1575,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return NULL; + return nullptr; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; @@ -1756,7 +1764,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) - return 0; + return nullptr; SDLoc dl(Node); @@ -1768,13 +1776,13 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Val = Node->getOperand(2); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return 0; + return nullptr; // Which index into the table. enum AtomicOpc Op; switch (Node->getOpcode()) { default: - return 0; + return nullptr; case ISD::ATOMIC_LOAD_OR: Op = OR; break; @@ -1795,7 +1803,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { unsigned Opc = 0; switch (NVT.SimpleTy) { - default: return 0; + default: return nullptr; case MVT::i8: if (isCN) Opc = AtomicOpcTbl[Op][ConstantI8]; @@ -1847,7 +1855,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { } cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + return CurDAG->getMergeValues(RetVals, dl).getNode(); } /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has @@ -1990,7 +1998,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, // Make a new TokenFactor with all the other input chains except // for the load. InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), - MVT::Other, &ChainOps[0], ChainOps.size()); + MVT::Other, ChainOps); } if (!ChainCheck) return false; @@ -2027,7 +2035,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { SDValue VMask = Node->getOperand(5); ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); if (!Scale) - return 0; + return nullptr; SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), MVT::Other); @@ -2058,7 +2066,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } switch (Opcode) { @@ -2108,7 +2116,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *RetVal = SelectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside SelectGather. - return NULL; + return nullptr; break; } } @@ -2259,7 +2267,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); - return NULL; + return nullptr; } case ISD::SMUL_LOHI: @@ -2386,7 +2394,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the low half of the result, if it is needed. 
if (!SDValue(Node, 0).use_empty()) { - if (ResLo.getNode() == 0) { + if (!ResLo.getNode()) { assert(LoReg && "Register for low half is not defined!"); ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); @@ -2397,7 +2405,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - if (ResHi.getNode() == 0) { + if (!ResHi.getNode()) { assert(HiReg && "Register for high half is not defined!"); ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); @@ -2407,7 +2415,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } - return NULL; + return nullptr; } case ISD::SDIVREM: @@ -2575,7 +2583,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } - return NULL; + return nullptr; } case X86ISD::CMP: @@ -2632,7 +2640,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testl %eax, $2048" to "testb %ah, $8". @@ -2669,7 +2677,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testl %eax, $32776" to "testw %ax, $32776". @@ -2691,7 +2699,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testq %rax, $268468232" to "testl %eax, $268468232". @@ -2713,7 +2721,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } } break; @@ -2740,7 +2748,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue StoredVal = StoreNode->getOperand(1); unsigned Opc = StoredVal->getOpcode(); - LoadSDNode *LoadNode = 0; + LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, LoadNode, InputChain)) @@ -2772,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *ResNode = SelectCode(Node); DEBUG(dbgs() << "=> "; - if (ResNode == NULL || ResNode == Node) + if (ResNode == nullptr || ResNode == Node) Node->dump(CurDAG); else ResNode->dump(CurDAG); @@ -2790,7 +2798,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, case 'v': // not offsetable ?? 
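// On the TEST-narrowing combines above ("testl %eax, $2048" becomes
// "testb %ah, $8"): when the tested mask fits entirely in bits 8-15, the
// same zero-flag result is available from AH with the mask shifted down by
// 8, which is why the real code guards the rewrite with
// HasNoSignedComparisonUses (the sign flag can differ). A quick check of
// that arithmetic (sketch, assuming only ZF matters):
#include <cassert>
#include <cstdint>

int main() {
  uint32_t eax = 0x12345800, mask = 2048;       // 2048 == 0x800, bits 8-15
  uint8_t ah = (eax >> 8) & 0xff;               // the AH subregister
  bool wide = (eax & mask) == 0;                // testl %eax, $2048
  bool narrow = (ah & (mask >> 8)) == 0;        // testb %ah, $8
  assert(wide == narrow);                       // same ZF either way
}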
default: return true; case 'm': // memory - if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4)) + if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2a35061..cbaf44e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-isel" #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" @@ -23,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +52,8 @@ #include <cctype> using namespace llvm; +#define DEBUG_TYPE "x86-isel" + STATISTIC(NumTailCalls, "Number of tail calls"); // Forward declarations. @@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); + makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, @@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() { // The _ftol2 runtime function has an unusual calling conv, which // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else if (TM.Options.EnableSegmentedStacks) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. 
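A recurring pattern earlier in this hunk is setLibcallName(RTLIB::FPTOUINT_*, nullptr): clearing the table entry tells the legalizer that no library call exists for that operation, so it must be lowered another way (here, the _ftol2 pseudo-instruction whose unusual calling convention the comment mentions). A toy model of that table lookup, with guessed-at libgcc names for illustration rather than the RTLIB API:

    #include <cstdio>

    // Tiny stand-in for the runtime-library name table: a null entry means
    // "no libcall available; the target lowers this some other way".
    const char *LibcallNames[2] = {"__fixunsdfdi", "__fixunssfdi"};

    void clearLibcall(unsigned Idx) { LibcallNames[Idx] = nullptr; }

    int main() {
      clearLibcall(0);   // like setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr)
      for (const char *N : LibcallNames)
        std::printf("%s\n", N ? N : "(custom lowering)");
    }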
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); @@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { @@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); + // There is no BLENDI for byte vectors. We don't need to custom lower + // some vselects for now. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); - - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { @@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i16, Custom); - setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); @@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); @@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v16i16, Legal); // Don't lower v32i8 because there is no 128-bit byte mul - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Legal); + setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - setOperationAction(ISD::SDIV, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::SDIV, MVT::v16i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); @@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, 
MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); @@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (!Subtarget->is64Bit()) + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() { if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, 0); - setLibcallName(RTLIB::SRL_I128, 0); - setLibcallName(RTLIB::SRA_I128, 0); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. @@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() { } } + if (Subtarget->isTargetWin64()) { + setOperationAction(ISD::SDIV, MVT::i128, Custom); + setOperationAction(ISD::UDIV, MVT::i128, Custom); + setOperationAction(ISD::SREM, MVT::i128, Custom); + setOperationAction(ISD::UREM, MVT::i128, Custom); + setOperationAction(ISD::SDIVREM, MVT::i128, Custom); + setOperationAction(ISD::UDIVREM, MVT::i128, Custom); + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, // FIXME: Why this routine is here? Move to RegInfo! std::pair<const TargetRegisterClass*, uint8_t> X86TargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = 0; + const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: @@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_X86); } -const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { - static const uint16_t ScratchRegs[] = { X86::R11, 0 }; +const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { + static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } @@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, - MVT::Other, &RetOps[0], RetOps.size()); + return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABIs require that for returning structs by value we copy - // the sret argument into %rax/%eax (depending on ABI) for the return. - // Win32 requires us to put the sret argument to %eax as well. 
- // Save the argument into a virtual register so that we can access it - // from the return points. - if (MF.getFunction()->hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { - X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - unsigned Reg = FuncInfo->getSRetReturnReg(); - if (!Reg) { - MVT PtrTy = getPointerTy(); - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); - FuncInfo->setSRetReturnReg(Reg); + if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // Save the argument into a virtual register so that we can access it + // from the return points. + if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); } unsigned StackSize = CCInfo.getNextStackOffset(); @@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; // FIXME: We should really autogenerate these arrays - static const uint16_t GPR64ArgRegsWin64[] = { + static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const uint16_t GPR64ArgRegs64Bit[] = { + static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - static const uint16_t XMMArgRegs64Bit[] = { + static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const uint16_t *GPR64ArgRegs; + const MCPhysReg *GPR64ArgRegs; unsigned NumXMMRegs = 0; if (IsWin64) { @@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SaveXMMOps.push_back(Val); } MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, - &SaveXMMOps[0], SaveXMMOps.size())); + MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } } @@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). -static SDValue -EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, - unsigned SlotSize, int FPDiff, SDLoc dl) { +static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + EVT PtrVT, unsigned SlotSize, + int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. 
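The LowerFormalArguments change above no longer assumes the sret pointer is argument 0; it scans the incoming-argument flags for the first sret and stashes that value in a virtual register so the return path can place it in %rax/%eax, as the comments require. The scan, reduced to plain C++ with a hypothetical ArgFlags stand-in:

    #include <cstdio>
    #include <vector>

    struct ArgFlags { bool SRet; };

    // Mirror of the rewritten loop: return the index of the first sret
    // argument, or -1 when nothing is returned through a hidden pointer.
    int findSRetArg(const std::vector<ArgFlags> &Ins) {
      for (size_t i = 0; i < Ins.size(); ++i)
        if (Ins[i].SRet)
          return (int)i;   // the real code copies InVals[i] and breaks
      return -1;
    }

    int main() {
      std::vector<ArgFlags> Ins = {{false}, {true}};  // e.g. sret after "this"
      std::printf("sret is argument %d\n", findSRetArg(Ins));
    }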
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; - if (isTailCall) { + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + if (IsMustTail) { + // Force this to be a tail call. The verifier rules are enough to ensure + // that we can lower this successfully without moving the return address + // around. + isTailCall = true; + } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, @@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; - if (isTailCall && !IsSibcall) { + if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); @@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT @@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. - static const uint16_t XMMArgRegs[] = { + static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. - if (isTailCall) { + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls + // don't need this because the eligibility check rejects calls that require + // shuffling arguments passed in memory. + if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and @@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (VA.isRegLoc()) - continue; - assert(VA.isMemLoc()); - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - // Create frame index. 
- int32_t Offset = VA.getLocMemOffset()+FPDiff; - uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - FIN = DAG.getFrameIndex(FI, getPointerTy()); - - if (Flags.isByVal()) { - // Copy relative to framepointer. - SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, - RegInfo->getStackRegister(), - getPointerTy()); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); - - MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, - ArgChain, - Flags, DAG, dl)); - } else { - // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); - } + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Skip inalloca arguments. They don't require any work. + if (Flags.isInAlloca()) + continue; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy()); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, + RegInfo->getStackRegister(), + getPointerTy()); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back( + DAG.getStore(ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); } } if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains2[0], MemOpChains2.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, @@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { return true; } +/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to INSERTPS. +/// i.e., if all but one element comes from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { + // TODO: Deal with AVX's VINSERTPS + if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) + return false; + + unsigned CorrectPosV1 = 0; + unsigned CorrectPosV2 = 0; + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + if (Mask[i] == i) + ++CorrectPosV1; + else if (Mask[i] == i + 4) + ++CorrectPosV2; + + if (CorrectPosV1 == 3 || CorrectPosV2 == 3) + // We have 3 elements from one vector, and one from another. + return true; + + return false; +} + // // Some special combinations that can be optimized. // @@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { return true; } +// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or +// (src1[0], src0[1]), manipulation with 256-bit sub-vectors +static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) { + if (!VT.is512BitVector()) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts/2; + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { + *Imm = 1; + return true; + } + } + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { + *Imm = 0; + return true; + } + } + return false; +} + /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. @@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } +/// isZero - Returns true if Elt is a constant integer zero +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isNullValue(); +} + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt)) - return CN->isNullValue(); + if (isZero(Elt)) + return true; if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) return CFP->getValueAPF().isPosZero(); return false; @@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { /// isScalarLoadToVector - Returns true if the node is a scalar load that /// is promoted to a vector. It also returns the LoadSDNode by reference if /// required. -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { +static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) return false; N = N->getOperand(0).getNode(); @@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); @@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; @@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { - SDValue ThisElt(0, 0), LastElt(0, 0); + SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, @@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; @@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. 
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, + unsigned NonZeros, unsigned NumNonZero, + unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // We know there's at least one non-zero element + unsigned FirstNonZeroIdx = 0; + SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); + while (FirstNonZero.getOpcode() == ISD::UNDEF || + X86::isZeroNode(FirstNonZero)) { + ++FirstNonZeroIdx; + FirstNonZero = Op->getOperand(FirstNonZeroIdx); + } + + if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) + return SDValue(); + + SDValue V = FirstNonZero.getOperand(0); + MVT VVT = V.getSimpleValueType(); + if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) + return SDValue(); + + unsigned FirstNonZeroDst = + cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); + unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; + unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; + unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + + for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { + SDValue Elem = Op.getOperand(Idx); + if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) + continue; + + // TODO: What else can be here? Deal with it. + if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // TODO: Some optimizations are still possible here + // ex: Getting one element from a vector, and the rest from another. + if (Elem.getOperand(0) != V) + return SDValue(); + + unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); + if (Dst == Idx) + ++CorrectIdx; + else if (IncorrectIdx == -1U) { + IncorrectIdx = Idx; + IncorrectDst = Dst; + } else + // There was already one element with an incorrect index. + // We can't optimize this case to an insertps. + return SDValue(); + } + + if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { + SDLoc dl(Op); + EVT VT = Op.getSimpleValueType(); + unsigned ElementMoveMask = 0; + if (IncorrectIdx == -1U) + ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; + else + ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + + SDValue InsertpsMask = + DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); + } + + return SDValue(); +} + /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, @@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); - LoadSDNode *LDBase = NULL; + LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. 
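The InsertpsMask built in LowerBuildVectorv4x32 above — and again in getINSERTPS further down — follows the insertps imm8 layout: bits [7:6] select the source lane, bits [5:4] select the destination lane, and bits [3:0] zero out destination lanes. A standalone helper showing the same encoding; an illustrative sketch, not code from the patch:

  // imm8 = (srcLane << 6) | (dstLane << 4) | zeroMask
  static unsigned insertpsImm(unsigned SrcLane, unsigned DstLane,
                              unsigned ZeroMask) {
    return (SrcLane << 6) | (DstLane << 4) | (ZeroMask & 0xF);
  }

For example, insertpsImm(2, 0, 0xC) copies source lane 2 into destination lane 0 and zeroes destination lanes 2 and 3 — the same shape as the ElementMoveMask | (~NonZeros & 0xf) value assembled above.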
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, - array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, @@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, unsigned ScalarSize = CVT.getSizeInBits(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { - const Constant *C = 0; + const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) @@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real +/// underlying vector and index. +/// +/// Modifies \p ExtractedFromVec to the real vector and returns the real +/// index. +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, + SDValue ExtIdx) { + int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) + return Idx; + + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already + // lowered this: + // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // to: + // (extract_vector_elt (vector_shuffle<2,u,u,u> + // (extract_subvector (v8f32 %vreg0), Constant<4>), + // undef) + // Constant<0>) + // In this case the vector is the extract_subvector expression and the index + // is 2, as specified by the shuffle. + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); + SDValue ShuffleVec = SVOp->getOperand(0); + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); + assert(ShuffleVecVT.getVectorElementType() == + ExtractedFromVec.getSimpleValueType().getVectorElementType()); + + int ShuffleIdx = SVOp->getMaskElt(Idx); + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { + ExtractedFromVec = ShuffleVec; + return ShuffleIdx; + } + return Idx; +} + static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); - // Quit if non-constant index. 
- if (!isa<ConstantSDNode>(ExtIdx)) - return SDValue(); - - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { - if (VecIn2.getNode() == 0) + if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } - unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); - if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); @@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } bool AllContants = true; uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; + unsigned NumNonConsts = 0; + unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) @@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(In)) { AllContants = false; NonConstIdx = idx; + NumNonConsts++; } - else if (cast<ConstantSDNode>(In)->getZExtValue()) + else { + NumConsts++; + if (cast<ConstantSDNode>(In)->getZExtValue()) Immediate |= (1ULL << idx); + } if (In != Op.getOperand(0)) IsSplat = false; } @@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0)); } + if (NumNonConsts == 1 && NonConstIdx != 0) { + SDValue DstVec; + if (NumConsts) { + SDValue VecAsImm = DAG.getConstant(Immediate, + MVT::getIntegerVT(VT.getSizeInBits())); + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); + } + else + DstVec = DAG.getUNDEF(VT); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(NonConstIdx), + DAG.getIntPtrConstant(NonConstIdx)); + } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; @@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. 
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); - SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], - NumElems/2); + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[0], NumElems/2)); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) @@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (V.getNode()) return V; } + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) { + SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, + NumZero, DAG, Subtarget, *this); + if (V.getNode()) + return V; + } + // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { @@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl, if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, - PshufbMask.data(), PshufbMask.size())); + DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); } // v8i16 shuffles - Prefer shuffles in the following order: @@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); // As PSHUFB will zero elements with negative indices, it's safe to ignore // the 2nd operand if it's undefined or zero. 
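The PSHUFB pair emitted around this point relies on the zeroing behavior called out in the comment above: each destination byte takes the source byte indexed by the low four bits of its mask byte, and a set high bit forces that destination byte to zero, so OR-ing the two half-shuffles yields the full shuffle. A scalar model of the 128-bit instruction, as a sketch (not from the patch):

  #include <cstdint>

  static void pshufb16(const uint8_t Src[16], const uint8_t Mask[16],
                       uint8_t Dst[16]) {
    for (int i = 0; i != 16; ++i)
      Dst[i] = (Mask[i] & 0x80) ? 0                    // high bit set: zero the byte
                                : Src[Mask[i] & 0x0F]; // else select a source byte
  }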
@@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } @@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned Scale; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); + case MVT::v2i64: + case MVT::v2f64: + return SDValue(SVOp, 0); case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; @@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { - LoadSDNode *LD = NULL; + LoadSDNode *LD = nullptr; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) LD = dyn_cast<LoadSDNode>(SrcOp); if (!LD) { @@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Construct the output using a BUILD_VECTOR. - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], - SVOps.size()); + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. Output[l] = DAG.getUNDEF(NVT); @@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + +// It is only safe to call this function if isINSERTPSMask is true for +// this shufflevector mask. +static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, + SelectionDAG &DAG) { + // Generate an insertps instruction when inserting an f32 from memory onto a + // v4f32 or when copying a member from one v4f32 to another. + // We also use it for transferring i32 from one register to another, + // since it simply copies the same bits. + // If we're transferring an i32 from memory to a specific element in a + // register, we output a generic DAG that will match the PINSRD + // instruction. 
+ MVT VT = SVOp->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + auto Mask = SVOp->getMask(); + assert((VT == MVT::v4f32 || VT == MVT::v4i32) && + "unsupported vector type for insertps/pinsrd"); + + int FromV1 = std::count_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }); + + SDValue From; + SDValue To; + unsigned DestIndex; + if (FromV1 == 1) { + From = V1; + To = V2; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }) - + Mask.begin(); + } else { + From = V2; + To = V1; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i >= 4; }) - + Mask.begin(); + } + + if (MayFoldLoad(From)) { + // Trivial case, when From comes from a load and is only used by the + // shuffle. Make it use insertps from the vector that we need from that + // load. + SDValue NewLoad = + NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG); + if (!NewLoad.getNode()) + return SDValue(); + + if (EVT == MVT::f32) { + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, + InsertpsMask); + } else { // EVT == MVT::i32 + // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT + // instruction, to match the PINSRD instruction, which loads an i32 to a + // certain vector element. + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, + DAG.getConstant(DestIndex, MVT::i32)); + } + } + + // Vector-element-to-vector + unsigned SrcIndex = Mask[DestIndex] % 4; + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); +} + // Reduce a vector shuffle to zext. static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } -static SDValue -NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8 || - VT == MVT::v16i16 || VT == MVT::v32i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || + VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { + } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { // FIXME: Figure out a cleaner way to do this. - // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT, true, false)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, + dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, + dl); } } } @@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShuffleSHUFImmediate(SVOp), DAG); } + unsigned Idx; + if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) + return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), + Idx*(NumElems/2), DAG, dl); + // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, @@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; + if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG); + unsigned Imm8; if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); @@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, - &permclMask[0], NumElems); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); if (V2IsUndef) // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 return DAG.getNode(X86ISD::VPERMV, dl, VT, @@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is +// true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symmetric for both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa<ConstantSDNode>(EltCond)) + Lane1Cond = !isZero(EltCond); + if (isa<ConstantSDNode>(SndLaneEltCond)) + Lane2Cond = !isZero(SndLaneEltCond); + + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition.
+ MaskValue |= !Lane1Cond << i; + else if (Lane1Cond < 0) + MaskValue |= !Lane2Cond << i; + else + return false; + } + return true; +} + +// Try to lower a vselect node into a simple blend instruction. +static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + MVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, + DAG.getConstant(MaskValue, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + +SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + // Some types for vselect were previously set to Expand, not Legal or + // Custom. Return an empty SDValue so we fall-through to Expand, after + // the Custom lowering phase. + MVT VT = Op.getSimpleValueType(); + switch (VT.SimpleTy) { + default: + break; + case MVT::v8i16: + case MVT::v16i16: + return SDValue(); + } + + // We couldn't create a "Blend with immediate" node. + // This node should still be legal, but we'll have to emit a blendv* + // instruction. + return Op; +} + static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa<ConstantSDNode>(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, MVT::i8)); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + unsigned MaxSift = rc->getSize()*8 - 1; + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); @@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } @@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDValue Base; if (is64Bit) { - Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; @@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Args[] = { Chain, Offset }; - Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array - // If GV is an alias then use the aliasee for determining - // thread-localness. 
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->getAliasedGlobal(); SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); @@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, Ops, array_lengthof(Ops), - SrcVT, MMO); + Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); @@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, array_lengthof(Ops), - Op.getValueType(), MMO); + Ops, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, false, 0); @@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - array_lengthof(Ops), MVT::i64, MMO); + MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); @@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, - array_lengthof(Ops), DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), DstTy, - MMO); + Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) - : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); + ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) + : DAG.getMergeValues(Ops, DL); return std::make_pair(pair, SDValue()); } } @@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, - &pshufbMask[0], 32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); @@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (FIST.getNode() == 0) return Op; + if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. @@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, VecIns.back(), VecIns.back()); } +/// \brief return true if \c Op has a use that doesn't just read flags. +static bool hasNonFlagsUse(SDValue Op) { + for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; + ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look past truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) + return true; + } + return false; +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. -SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - SDLoc dl(Op); - if (Op.getValueType() == MVT::i1) // KORTEST instruction should be selected return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, @@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, Opcode = X86ISD::ADD; NumOperands = 2; break; - case ISD::AND: { - // If the primary and result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. - bool NonFlagUse = false; - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - unsigned UOpNo = UI.getOperandNo(); - if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { - // Look pass truncate. - UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { - NonFlagUse = true; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero, turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. break; - } + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? 
APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; } + break; - if (!NonFlagUse) + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) break; - } // FALL THROUGH case ISD::SUB: case ISD::OR: @@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, for (unsigned i = 0; i != NumOperands; ++i) Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } @@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - SelectionDAG &DAG) const { - SDLoc dl(Op0); + SDLoc dl, SelectionDAG &DAG) const { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, DAG); + return EmitTest(Op0, X86CC, dl, DAG); if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); @@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { APInt Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Zeros, Ones); + DAG.computeKnownBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. -static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); if (!BV) @@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(), - ULTOp1.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, @@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Only do this pre-AVX since vpcmp* is no longer destructive. 
if (Subtarget->hasAVX()) break; - SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG); + SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); if (ULEOp1.getNode()) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; @@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (X86CC == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); @@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; @@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNOT(DL, Res, Res.getValueType()); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (N2C == 0 || !N2C->isNullValue()) + if (!N2C || !N2C->isNullValue()) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isOSWindows() || - getTargetMachine().Options.EnableSegmentedStacks) && - "This should be used only on Windows targets or when segmented stacks " - "are being used"); - assert(!Subtarget->isTargetMacho() && "Not implemented"); + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + SplitStack; SDLoc dl(Op); + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDNode* Node = Op.getNode(); + + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = SDValue(Node, 0); + SDValue Tmp2 = SDValue(Node, 1); + SDValue Tmp3 = Node->getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + // Chain the dynamic stack allocation so that it doesn't modify 
the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), + SDLoc(Node)); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + + Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue(), + SDLoc(Node)); + + SDValue Ops[2] = { Tmp1, Tmp2 }; + return DAG.getMergeValues(Ops, dl); + } + // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (getTargetMachine().Options.EnableSegmentedStacks) { - MachineFunction &MF = DAG.getMachineFunction(); + if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); @@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } } @@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOps[0], MemOps.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { InstOps.push_back(DAG.getConstant(Align, MVT::i32)); SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, - VTs, &InstOps[0], InstOps.size(), - MVT::i64, + VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, @@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Fold this packed shift into its first operand if ShiftAmt is 0. 
+ if (ShiftAmt == 0) + return SrcOp; + // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) @@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, ConstantSDNode *ND; switch(Opc) { - default: llvm_unreachable(0); + default: llvm_unreachable(nullptr); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); @@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, break; } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); @@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); // The return type has to be a 128-bit type with the same element // type as the input type. @@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulhu_w: + case Intrinsic::x86_avx2_pmulhu_w: + return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulh_w: + case Intrinsic::x86_avx2_pmulh_w: + return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics case Intrinsic::x86_sse2_psubus_b: case Intrinsic::x86_sse2_psubus_w: @@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), SDValue(PCMP.getNode(), 1)); @@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: @@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain, - const X86Subtarget * Subtarget) { - SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); - SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, 
MaskVT); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); -} - -static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { @@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); + return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); @@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } -static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { +static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = 
DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + EVT MaskVT = + MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + //SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); + return SDValue(Res, 0); +} + +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. +static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
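Both legs of the getReadTimeStampCounter helper introduced here reassemble the counter the same way: the 64-bit path shifts RDX left by 32 and ORs in RAX, and the 32-bit path (the BUILD_PAIR just below) pairs EAX/EDX. A minimal scalar sketch of that recombination, with a hypothetical helper name:

    #include <cstdint>

    // lo comes from EAX, hi from EDX; the 64-bit lowering computes
    // (HI << 32) | LO, and the 32-bit lowering builds the same value
    // with a BUILD_PAIR node.
    static inline uint64_t combineTSC(uint32_t lo, uint32_t hi) {
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }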
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector<SDValue, 2> Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + +enum IntrinsicType { + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST +}; + +struct IntrinsicData { + IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) + :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; +}; + +std::map < unsigned, IntrinsicData> IntrMap; +static void InitIntinsicsMap() { + static bool Initialized = false; + if (Initialized) + return; + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, + IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, + IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, + IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, + IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, + IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, + IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, + IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, + IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, + IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, + IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, + IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, + X86::VGATHERPF1QPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, + X86::VGATHERPF1QPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, + X86::VGATHERPF1DPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, + X86::VGATHERPF1DPSm))); + 
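Two small observations on the table being built here: the x86_avx512_gather_qps_512 entry is inserted twice (harmless, since std::map::insert ignores a duplicate key, but redundant), and the explicit Initialized flag leaves first use unsynchronized. A minimal sketch of the same table-driven dispatch using a function-local static instead; all IDs and opcode values below are placeholders, not the real LLVM constants:

    #include <map>

    enum IntrinsicType { GATHER, SCATTER, PREFETCH };
    struct IntrinsicData {
      IntrinsicType Type;
      unsigned Opc0, Opc1; // Opc1 is only meaningful for PREFETCH entries
    };

    static const std::map<unsigned, IntrinsicData> &getIntrinsicTable() {
      // Initialized once, on first use, without an explicit flag.
      static const std::map<unsigned, IntrinsicData> Table = {
          {/*gather intrinsic id*/ 1, {GATHER, /*opcode*/ 100, 0}},
          {/*prefetch intrinsic id*/ 2, {PREFETCH, /*hint 0*/ 101, /*hint 1*/ 102}},
      };
      return Table;
    }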
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_xtest, + IntrinsicData(XTEST, X86ISD::XTEST, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc, + IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, + IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + Initialized = true; } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDLoc dl(Op); + InitIntinsicsMap(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. + std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo); + if (itr == IntrMap.end()) + return SDValue(); - // RDRAND/RDSEED intrinsics. - case Intrinsic::x86_rdrand_16: - case Intrinsic::x86_rdrand_32: - case Intrinsic::x86_rdrand_64: - case Intrinsic::x86_rdseed_16: - case Intrinsic::x86_rdseed_32: - case Intrinsic::x86_rdseed_64: { - unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || - IntNo == Intrinsic::x86_rdseed_32 || - IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : - X86ISD::RDRAND; + SDLoc dl(Op); + IntrinsicData Intr = itr->second; + switch(Intr.Type) { + case RDSEED: + case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. @@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops, array_lengthof(Ops)); + Ops); // Return { result, isValid, chain }. 
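The CF handling here relies on the documented RDRAND/RDSEED contract: on success CF is set and the destination holds the random value; on failure CF is clear and the destination is zero, which is why the lowering can CMOV between the two. The same contract is visible through the compiler intrinsic (a sketch; requires <immintrin.h> and RDRAND hardware):

    #include <immintrin.h>

    // _rdrand32_step returns 1 when the hardware produced a valid
    // value (CF was set) and 0 otherwise -- the same { result,
    // isValid } pair the lowering above materializes with CMOV.
    bool tryHardwareRandom(unsigned int &Out) {
      return _rdrand32_step(&Out) == 1;
    }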
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } - //int_gather(index, base, scale); - case Intrinsic::x86_avx512_gather_qpd_512: - case Intrinsic::x86_avx512_gather_qps_512: - case Intrinsic::x86_avx512_gather_dpd_512: - case Intrinsic::x86_avx512_gather_qpi_512: - case Intrinsic::x86_avx512_gather_qpq_512: - case Intrinsic::x86_avx512_gather_dpq_512: - case Intrinsic::x86_avx512_gather_dps_512: - case Intrinsic::x86_avx512_gather_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Index = Op.getOperand(2); - SDValue Base = Op.getOperand(3); - SDValue Scale = Op.getOperand(4); - return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); - } - //int_gather_mask(v1, mask, index, base, scale); - case Intrinsic::x86_avx512_gather_qps_mask_512: - case Intrinsic::x86_avx512_gather_qpd_mask_512: - case Intrinsic::x86_avx512_gather_dpd_mask_512: - case Intrinsic::x86_avx512_gather_dps_mask_512: - case Intrinsic::x86_avx512_gather_qpi_mask_512: - case Intrinsic::x86_avx512_gather_qpq_mask_512: - case Intrinsic::x86_avx512_gather_dpi_mask_512: - case Intrinsic::x86_avx512_gather_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_avx512_gather_qps_mask_512: - Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_mask_512: - Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_mask_512: - Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_mask_512: - Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_mask_512: - Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_mask_512: - Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_mask_512: - Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_mask_512: - Opc = X86::VPGATHERDQZrm; break; - } + case GATHER: { + //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); + SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Base = Op.getOperand(5); + SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } - //int_scatter(base, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qpd_512: - case Intrinsic::x86_avx512_scatter_qps_512: - case Intrinsic::x86_avx512_scatter_dpd_512: - case Intrinsic::x86_avx512_scatter_qpi_512: - case Intrinsic::x86_avx512_scatter_qpq_512: - case Intrinsic::x86_avx512_scatter_dpq_512: - case Intrinsic::x86_avx512_scatter_dps_512: - case Intrinsic::x86_avx512_scatter_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_scatter_qpd_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_512: - Opc = X86::VPSCATTERDDZmr; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Base = Op.getOperand(2); - SDValue Index = Op.getOperand(3); - SDValue Src = Op.getOperand(4); - SDValue Scale = Op.getOperand(5); - return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); - } - //int_scatter_mask(base, mask, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qps_mask_512: - case Intrinsic::x86_avx512_scatter_qpd_mask_512: - case Intrinsic::x86_avx512_scatter_dpd_mask_512: - case Intrinsic::x86_avx512_scatter_dps_mask_512: - case Intrinsic::x86_avx512_scatter_qpi_mask_512: - case Intrinsic::x86_avx512_scatter_qpq_mask_512: - case Intrinsic::x86_avx512_scatter_dpi_mask_512: - case Intrinsic::x86_avx512_scatter_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
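As a reading aid for the GATHER case above (operand order v1, mask, index, base, scale), a scalar model of what the selected gather instruction does per lane; this sketches the architectural semantics and is not code from the patch:

    #include <cstdint>

    // Lanes with a set mask bit load from base + index*scale (scale is
    // a byte scale applied to the index); lanes with a clear bit keep
    // the pass-through value from Src.
    void gatherModel(float *Dst, const char *Base, const int64_t *Index,
                     const bool *Mask, const float *Src,
                     unsigned NumElts, unsigned Scale) {
      for (unsigned i = 0; i != NumElts; ++i)
        Dst[i] = Mask[i]
                     ? *reinterpret_cast<const float *>(Base + Index[i] * Scale)
                     : Src[i];
    }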
- case Intrinsic::x86_avx512_scatter_qpd_mask_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_mask_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_mask_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_mask_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_mask_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_mask_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_mask_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_mask_512: - Opc = X86::VPSCATTERDDZmr; break; - } + case SCATTER: { + //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + } + case PREFETCH: { + SDValue Hint = Op.getOperand(6); + unsigned HintVal; + if (dyn_cast<ConstantSDNode> (Hint) == 0 || + (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) + llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); + unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); + SDValue Chain = Op.getOperand(0); + SDValue Mask = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Base = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + } + // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). + case RDTSC: { + SmallVector<SDValue, 2> Results; + getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. - case Intrinsic::x86_xtest: { + case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, @@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Ret, SDValue(InTrans.getNode(), 1)); } } + llvm_unreachable("Unknown Intrinsic Type"); } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, @@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
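The PREFETCH case above packs the two prefetch flavors into one table entry and selects between them with the constant hint operand. A sketch of that selection, reusing the IntrinsicData shape from the earlier sketch:

    #include <cassert>

    // Hint 0 selects the T0 (VGATHERPF0*/VSCATTERPF0*) opcode, hint 1
    // the T1 (PF1) opcode; the lowering rejects any other constant.
    unsigned pickPrefetchOpcode(const IntrinsicData &Intr, uint64_t HintVal) {
      assert(HintVal <= 1 && "prefetch hint must be 0 or 1");
      return HintVal ? Intr.Opc1 : Intr.Opc0;
    }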
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = @@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 22), false, false, 0); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); @@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 6), false, false, 1); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } @@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), MVT::i16, - MMO); + Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); @@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltTy = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - SDValue N0 = Op.getOperand(0); +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); + EVT VT = Op.getValueType(); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); - // Lower sdiv X, pow2-const. 
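getRegisterByName above recognizes only the stack pointers; the StringSwitch chain it uses grows by one .Case per register, so a hypothetical extension (not part of this patch) would look like:

    #include "llvm/ADT/StringSwitch.h"

    // Hypothetical: also accept the frame pointers. Unknown names fall
    // through to 0 and the caller reports a fatal error, as above.
    static unsigned lookupRegisterByName(llvm::StringRef RegName) {
      return llvm::StringSwitch<unsigned>(RegName)
          .Case("esp", X86::ESP)
          .Case("rsp", X86::RSP)
          .Case("ebp", X86::EBP)   // hypothetical addition
          .Case("rbp", X86::RBP)   // hypothetical addition
          .Default(0);
    }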
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); - if (!C) - return SDValue(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs) || - EltTy.getSizeInBits() < SplatBitSize) - return SDValue(); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + Callee, &Args, 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + EVT VT = Op0.getValueType(); + SDLoc dl(Op); - if ((SplatValue != 0) && - (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned Lg2 = SplatValue.countTrailingZeros(); - // Splat the sign bit. - SmallVector<SDValue, 16> Sz(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - 1, - EltTy)); - SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], - NumElts)); - // Add (N0 < 0) ? abs2 - 1 : 0; - SmallVector<SDValue, 16> Amt(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - Lg2, - EltTy)); - SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], - NumElts)); - SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); - SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], - NumElts)); - - // If we're dividing by a positive value, we're done. Otherwise, we must - // negate the result. - if (SplatValue.isNonNegative()) - return SRA; - - SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); - return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // Get the high parts. + const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ? 
X86ISD::PMULUDQ : X86ISD::PMULDQ; + SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); + + // Shuffle it back into the right order. + const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + + // If we have a signed multiply but no PMULDQ fix up the high parts of a + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = + DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return SDValue(); + + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
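The fixup above converts an unsigned high-half multiply into a signed one using the identity mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0); T1 and T2 compute the two correction terms with an arithmetic-shift-and-AND. A scalar check of the identity (a sketch, not patch code):

    #include <cstdint>

    int32_t mulhs_via_mulhu(int32_t a, int32_t b) {
      // Unsigned high 32 bits of the 64-bit product.
      uint32_t hu = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
      uint32_t t1 = (uint32_t)(a >> 31) & (uint32_t)b; // b if a < 0, else 0
      uint32_t t2 = (uint32_t)(b >> 31) & (uint32_t)a; // a if b < 0, else 0
      return (int32_t)(hu - t1 - t2);                  // signed high 32 bits
    }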
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = 0; for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); @@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); @@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (!BaseShAmt.getNode()) BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, DAG.getIntPtrConstant(0)); } @@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } @@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getNode(ISD::MUL, dl, VT, Op, R); } + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it. + // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. 
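The MOVSS rewrite described in the comment above, expressed with SSE intrinsics for the v4i32 srl case: two uniform shifts plus one movss replace four scalar shifts and the insert/extract traffic. This sketches the idea and is not code from the patch:

    #include <immintrin.h>

    // Computes (v4i32 (srl A, <X,Y,Y,Y>)): shift all lanes by Y, shift
    // all lanes by X, then movss takes lane 0 from the X-shifted copy.
    __m128i srl_x_y_y_y(__m128i a, int x, int y) {
      __m128i byY = _mm_srl_epi32(a, _mm_cvtsi32_si128(y));
      __m128i byX = _mm_srl_epi32(a, _mm_cvtsi32_si128(x));
      return _mm_castps_si128(
          _mm_move_ss(_mm_castsi128_ps(byY), _mm_castsi128_ps(byX)));
    }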
+ CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. + CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && + isa<ConstantSDNode>(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. + EVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); + SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); @@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt1Csts[0], NumElems/2); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt2Csts[0], NumElems/2); + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); @@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, array_lengthof(Ops), T, MMO); + Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; } -static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(Subtarget->is64Bit() && "Result not type legalized?"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = Op.getOperand(0); - SDLoc dl(Op); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); - SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, - rax.getValue(2)); - SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, - DAG.getConstant(32, MVT::i8)); - SDValue Ops[] = { - DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), - 
rdx.getValue(1) - }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); -} - static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (DstVT != MVT::f64) + // This conversion needs to be expanded. + return SDValue(); + + SDValue InVec = Op->getOperand(0); + SDLoc dl(Op); + unsigned NumElts = SrcVT.getVectorNumElements(); + EVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i))); + + // Explicitly mark the extra elements as Undef. + SDValue Undef = DAG.getUNDEF(SVT); + for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) + Elts.push_back(Undef); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + DAG.getIntPtrConstant(0)); + } + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || @@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { cast<AtomicSDNode>(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, - cast<AtomicSDNode>(Node)->getSrcValue(), - cast<AtomicSDNode>(Node)->getAlignment(), + cast<AtomicSDNode>(Node)->getMemOperand(), cast<AtomicSDNode>(Node)->getOrdering(), cast<AtomicSDNode>(Node)->getSynchScope()); } @@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, Type *RetTy = isF64 ? 
(Type*)StructType::get(ArgTy, ArgTy, NULL) : (Type*)VectorType::get(ArgTy, 4); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, - false, false, false, false, 0, - CallingConv::C, /*isTaillCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed*/true, - Callee, Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); if (isF64) @@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); @@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } @@ -13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, cast<MemSDNode>(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); Results.push_back(Result.getValue(2)); } @@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode() != 0) { + if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
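The FP_TO_SINT/FP_TO_UINT replacement above goes through memory when the helper produces a FIST node: the x87-style store writes the converted integer to a stack temporary, and the result pushed into Results is an ordinary load from that slot. In pseudo-IR, sketching the shape rather than the patch's exact nodes:

    ; %slot is a stack temporary created by FP_TO_INTHelper
    ;   FIST  %fpval -> %slot     ; chain-ordered integer store
    ;   %res = load i64, %slot    ; the load pushed into Results above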
- if (StackSlot.getNode() != 0) + if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); @@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + } + } case ISD::READCYCLECOUNTER: { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = N->getOperand(0); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, - rd.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, - eax.getValue(2)); - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, - array_lengthof(Ops))); - Results.push_back(edx.getValue(1)); - return; + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); @@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, array_lengthof(Ops), T, MMO); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? 
X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(cpOutH.getValue(1)); return; } @@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); return; } - case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); + return; + } + case ISD::BITCAST: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N->getOperand(0)->getValueType(0); + + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + + SmallVector<SDValue, 8> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return NULL; + default: return nullptr; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; @@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; @@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. 
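For readers outside the backend, the form being validated by isLegalAddressingMode here is the generic x86 memory operand [Base + Index*Scale + Disp], with Scale restricted to 1, 2, 4, or 8 and the displacement a sign-extended 32-bit immediate. For example (AT&T syntax):

    # table + %rdi + %rsi*4: base, scaled index, and symbolic
    # displacement all folded into one legal address.
    movl table(%rdi,%rsi,4), %eax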
- if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { @@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (VT.getSizeInBits() == 64) return false; - // FIXME: pshufb, blends, shifts. + // If this is a single-input shuffle with no 128 bit lane crossings we can + // lower it into pshufb. + if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || + (SVT.is256BitVector() && Subtarget->hasInt256())) { + bool isLegal = true; + for (unsigned I = 0, E = M.size(); I != E; ++I) { + if (M[I] >= (int)SVT.getVectorNumElements() || + ShuffleCrosses128bitLane(SVT, I, M[I])) { + isLegal = false; + break; + } + } + if (isLegal) + return true; + } + + // FIXME: blends, shifts. return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || @@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( OffsetDestReg = 0; // unused OverflowDestReg = DestReg; - offsetMBB = NULL; + offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { @@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(getTargetMachine().Options.EnableSegmentedStacks); + assert(MF->shouldSplitStack()); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // X86 Optimization Hooks //===----------------------------------------------------------------------===// -void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || @@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } } -unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, - unsigned Depth) const { +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &, + unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 
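The reasoning behind the SETCC_CARRY case handled just below: the node is materialized as a subtract-with-borrow of a register from itself, which can only produce 0 or all-ones, so every bit of the result is a copy of the sign bit. In assembly terms:

    # sbb computes eax = eax - eax - CF, i.e. 0 when CF=0 and
    # 0xFFFFFFFF when CF=1 -- 32 sign bits either way.
    sbb %eax, %eax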
if (Op.getOpcode() == X86ISD::SETCC_CARRY) return Op.getValueType().getScalarType().getSizeInBits(); @@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - array_lengthof(Ops), Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), @@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, return std::make_pair(Opc, NeedSplit); } +static SDValue +TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + MVT VT = N->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + SmallVector<int, 8> ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, @@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. - // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // FIXME: Would it be better to use computeKnownBits to determine whether // it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && @@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // depend on the highest bit in each word. Try to use SimplifyDemandedBits // to simplify previous instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && + // We explicitly check against v8i16 and v16i16 because, although + // they're marked as Custom, they might only be legal when Cond is a + // build_vector of constants. This will be taken care in a later + // condition. + (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && + VT != MVT::v8i16)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. 
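A worked example of the mask translation in TransformVSELECTtoBlendVECTOR_SHUFFLE above: each bit i of MaskValue contributes shuffle index i (first shuffle operand) when clear, or i + NumElems (second operand) when set, with undef cond lanes becoming -1. For a v4i32 node with MaskValue = 0b0110:

    // ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1)
    //   MaskValue = 0b0110, NumElems = 4
    //   -> ShuffleMask = { 0, 5, 6, 3 }
    // Lanes 0 and 3 come from the first shuffle operand; lanes 1 and 2
    // from the second (indices >= NumElems select the second operand).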
@@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, DCI.CommitTargetLoweringOpt(TLO); } + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { + SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + return SDValue(); } @@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; - const ConstantSDNode* C = 0; + const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? @@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } // If this is a select between two integer constants, try to do some @@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. - ConstantSDNode *CmpAgainst = 0; + ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && !isa<ConstantSDNode>(Cond.getOperand(0))) { @@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, MVT::i8), Cond }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, - array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } @@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); + // SSE/AVX/AVX2 blend intrinsics. + case Intrinsic::x86_avx2_pblendvb: + case Intrinsic::x86_avx2_pblendw: + case Intrinsic::x86_avx2_pblendd_128: + case Intrinsic::x86_avx2_pblendd_256: + // Don't try to simplify this intrinsic if we don't have AVX2. + if (!Subtarget->hasAVX2()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_avx_blend_pd_256: + case Intrinsic::x86_avx_blend_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx_blendv_ps_256: + // Don't try to simplify this intrinsic if we don't have AVX. 
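The blend-intrinsic switch being added here layers its feature guards by deliberate fall-through: AVX2-only intrinsics first check hasAVX2, then fall into the AVX-guarded group, which falls into the shared SSE4.1 handling. Schematically:

    // case avx2_pblend...:  if (!hasAVX2()) return SDValue();  // FALL-THROUGH
    // case avx_blend...:    if (!hasAVX())  return SDValue();  // FALL-THROUGH
    // case sse41_blend...:  common folding code (itself guarded by hasSSE41)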
+ if (!Subtarget->hasAVX()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_sse41_pblendw: + case Intrinsic::x86_sse41_blendpd: + case Intrinsic::x86_sse41_blendps: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_sse41_pblendvb: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + SDValue Mask = N->getOperand(3); + + // Don't try to simplify this intrinsic if we don't have SSE4.1. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return Op0; + // fold (blend A, B, allZeros) -> A + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return Op0; + // fold (blend A, B, allOnes) -> B + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Op1; + + // Simplify the case where the mask is a constant i32 value. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { + if (C->isNullValue()) + return Op0; + if (C->isAllOnesValue()) + return Op1; + } + } + + // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + EVT VT = Op0.getValueType(); + assert(VT.isVector() && "Expected a vector type!"); + + if (isa<BuildVectorSDNode>(Op1)) + Op1 = Op1.getOperand(0); + + if (!isa<ConstantSDNode>(Op1)) + return SDValue(); + + EVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + + ConstantSDNode *CND = cast<ConstantSDNode>(Op1); + const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + + // Don't try to convert this shift into a ISD::SRA if the shift + // count is bigger than or equal to the element size. + if (ShAmt >= SVTBits) + return SDValue(); + + // Trivial case: if the shift count is zero, then fold this + // into the first operand. + if (ShAmt == 0) + return Op0; + + // Replace this packed shift intrinsic with a target independent + // shift dag node. + SDValue Splat = DAG.getConstant(C, VT); + return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); + } + } +} + /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. 
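For reference, this is the kind of strength reduction the PerformMulCombine comment above describes, as a standalone sketch; the asm mnemonics in the comments are the intended instruction selections, and the driver itself is illustrative rather than part of the patch.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 1234;
  uint64_t by9 = x + x * 8;         // lea (%rax,%rax,8), %rax
  uint64_t by40 = (x + x * 4) << 3; // lea (%rax,%rax,4), %rax ; shl $3, %rax
  assert(by9 == x * 9 && by40 == x * 40);
  return 0;
}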
@@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), N1->getOperand(0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); - N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { N1 = N1->getOperand(0); } @@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - // Create BEXTR and BZHI instructions - // BZHI is X & ((1 << Y) - 1) + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (Subtarget->hasBMI2()) { - // Check for (and (add (shl 1, Y), -1), X) - if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::SHL) { - SDValue N001 = N00.getOperand(1); - assert(N001.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); - } - } - - // Check for (and X, (add (shl 1, Y), -1)) - if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::SHL) { - SDValue N101 = N10.getOperand(1); - assert(N101.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); - } - } - } - // Check for BEXTR. if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { @@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { SDValue Ops[] = { N0.getOperand(0), Neg, DAG.getConstant(X86::COND_GE, MVT::i8), SDValue(Neg.getNode(), 1) }; - return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } return SDValue(); } @@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. @@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Chains.push_back(Ch); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. 
This avoids clobbering
@@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
     SDNode* LdVal = St->getValue().getNode();
-    LoadSDNode *Ld = 0;
+    LoadSDNode *Ld = nullptr;
     int TokenFactorIndex = -1;
     SmallVector<SDValue, 8> Ops;
     SDNode* ChainVal = St->getChain().getNode();
@@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
         Ops.push_back(NewChain);
-        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                               Ops.size());
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
       }
       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getPointerInfo(),
@@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     if (TokenFactorIndex != -1) {
       Ops.push_back(LoLd);
       Ops.push_back(HiLd);
-      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                             Ops.size());
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
     }

     LoAddr = St->getBasePtr();
@@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // Extract the countS bits from the immediate so we can get the proper
+    // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, insertps doesn't use
+    // countS and just gets an f32 from that address.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It materializes "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
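A quick standalone model of the setb-versus-sbb point made in the comment above (the patch only documents the helper here): after a compare that sets CF, sbb reg,reg leaves -CF in the register, i.e. 0 or all-ones, so no extra zero-extension is needed at wider widths. The driver is illustrative, not part of the patch.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t a = 1, b = 2;
  uint32_t cf = a < b ? 1 : 0;              // CF after cmp; what setb stores
  int32_t mask = -static_cast<int32_t>(cf); // sbb reg,reg: reg - reg - CF
  assert(mask == -1);                       // all-ones, usable at any width
  return 0;
}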
@@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
-  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
@@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }

   return SDValue();
@@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;

   // Only support length 1 constraints for now.
   if (Constraint.length() > 1) return;
@@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,

     // If we are in non-pic codegen mode, we allow the address of a global (with
     // an optional displacement) to be used with 'i'.
-    GlobalAddressSDNode *GA = 0;
+    GlobalAddressSDNode *GA = nullptr;
     int64_t Offset = 0;

     // Match either (GA), (GA+C), (GA+C1+C2), etc.
@@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

   // Not found as a standard register?
-  if (Res.second == 0) {
+  if (!Res.second) {
     // Map st(0) -> st(7) -> ST0
     if (Constraint.size() == 7 && Constraint[0] == '{' &&
         tolower(Constraint[1]) == 's' &&
@@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,

   return Res;
 }
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                            Type *Ty) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having fewer micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 0f0d17b..9f51b53 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -83,6 +83,9 @@ namespace llvm {
       /// readcyclecounter
       RDTSC_DAG,

+      /// X86 Read Time-Stamp Counter and Processor ID.
+      RDTSCP_DAG,
+
       /// X86 compare and logical compare instructions.
       CMP, COMI, UCOMI,

@@ -291,7 +294,6 @@ namespace llvm {
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,

-      BZHI,   // BZHI - Zero high bits
       BEXTR,  // BEXTR - Bit field extract

       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
@@ -345,6 +347,8 @@ namespace llvm {

       // PMULUDQ - Vector multiply packed unsigned doubleword integers
       PMULUDQ,
+      // PMULDQ - Vector multiply packed signed doubleword integers
+      PMULDQ,

       // FMA nodes
       FMADD,
@@ -614,18 +618,19 @@ namespace llvm {
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;

-    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
     /// in Mask are known to be either zero or one and return them in the
     /// KnownZero/KnownOne bitsets.
-    void computeMaskedBitsForTargetNode(const SDValue Op,
-                                        APInt &KnownZero,
-                                        APInt &KnownOne,
-                                        const SelectionDAG &DAG,
-                                        unsigned Depth = 0) const override;
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;

     // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
     // operation that are sign bits.
     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const SelectionDAG &DAG,
                                              unsigned Depth) const override;

     bool isGAPlusOffset(SDNode *N,
                         const GlobalValue* &GA,
@@ -679,6 +684,12 @@ namespace llvm {
     /// the immediate into a register.
     bool isLegalAddImmediate(int64_t Imm) const override;

+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;

     bool isVectorShiftByScalarCheap(Type *Ty) const override;

@@ -771,10 +782,12 @@ namespace llvm {
                                   Type *Ty) const override;

     /// Intel processors have a unified instruction and data cache
-    const char * getClearCacheBuiltinName() const {
-      return 0; // nothing to do, move along.
+    const char * getClearCacheBuiltinName() const override {
+      return nullptr; // nothing to do, move along.
     }

+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -871,8 +884,11 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -908,6 +924,7 @@ namespace llvm { SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFormalArguments(SDValue Chain, @@ -936,7 +953,7 @@ namespace llvm { const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const override; - const uint16_t *getScratchRegisters(CallingConv::ID CC) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to @@ -987,11 +1004,12 @@ namespace llvm { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. - SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const; + SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent, for use with the given x86 condition code. - SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const; /// Convert a comparison if required by the subtarget. 
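To make the contract of the getScalingFactorCost hook declared above concrete, here is a standalone model. The AddrMode struct below is a simplified stand-in for TargetLowering::AddrMode, and the driver is illustrative, not part of the patch: legal modes cost 0 without a scaled index and 1 with one, while unsupported modes report a negative value.

#include <cassert>

struct AddrMode { bool Legal; int Scale; }; // stand-in, not the LLVM type

int scalingFactorCost(const AddrMode &AM) {
  if (AM.Legal)
    return AM.Scale != 0 ? 1 : 0; // a scaled index costs one extra allocation
  return -1;                      // unsupported addressing mode
}

int main() {
  assert(scalingFactorCost({true, 0}) == 0);  // vaddps (%rsi), %ymm0, %ymm1
  assert(scalingFactorCost({true, 2}) == 1);  // vaddps (%rsi,%rdx,2), ...
  assert(scalingFactorCost({false, 2}) < 0);
  return 0;
}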
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2c5edf6..37bcc52 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -209,12 +209,12 @@ def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insrtps VR128X:$src1, + [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; @@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512me X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), + (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), + (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx), + (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), + (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx), + (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))), + (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx), + (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), + (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // @@ -984,6 +1000,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; + def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; + def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; } // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { @@ -1356,6 +1376,23 @@ defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load, "vmovdqu64", SSEPackedInt, v8i64>, XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, + (v16i32 immAllZerosV), GR16:$mask)), + (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src), + GR16:$mask), + (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), + GR8:$mask), + (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)))), @@ -3112,6 +3149,17 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; +def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), @@ -3715,7 +3763,7 @@ defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, EVEX_CD8<32, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), + imm:$src2, (v16f32 VR512:$src1), (i16 -1), FROUND_CURRENT)), (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; @@ -3725,7 +3773,7 @@ defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), + imm:$src2, (v8f64 VR512:$src1), (i8 -1), FROUND_CURRENT)), (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; @@ -3807,7 +3855,13 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), []>, EVEX; - def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), + def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), + (ins KRC:$mask, srcRC:$src), + !strconcat(OpcodeStr, + " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + []>, EVEX, EVEX_K; + + def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins KRC:$mask, srcRC:$src), !strconcat(OpcodeStr, " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), @@ -3816,6 +3870,12 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>, EVEX; + + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, KRC:$mask, srcRC:$src), + !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), + 
[]>, EVEX, EVEX_K; + } defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -3855,60 +3915,86 @@ def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBkrr VK16WM:$mask, VR512:$src)>; + (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWkrr VK16WM:$mask, VR512:$src)>; + (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWkrr VK8WM:$mask, VR512:$src)>; + (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDkrr VK8WM:$mask, VR512:$src)>; + (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; -multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC, - RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT> { +multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode, + PatFrag mem_frag, X86MemOperand x86memop, + ValueType OpVT, ValueType InVT> { def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX; - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), + + def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), + (ins KRC:$mask, SrcRC:$src), + !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + []>, EVEX, EVEX_K; + + def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), + (ins KRC:$mask, SrcRC:$src), + !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + []>, EVEX, EVEX_KZ; + + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>, EVEX; + + def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), + (ins KRC:$mask, x86memop:$src), + !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + []>, + EVEX, EVEX_K; + + def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), + (ins KRC:$mask, x86memop:$src), + !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + []>, + EVEX, EVEX_KZ; + } } -defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, +defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, +defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, +defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext, memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, +defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, 
X86vzext, +defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext, memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; - -defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, + +defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, +defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, +defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext, memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, +defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, +defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext, memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; @@ -3984,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; +// prefetch +multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, + RegisterClass KRC, X86MemOperand memop> { + let Predicates = [HasPFI], hasSideEffects = 1 in + def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"), + []>, EVEX, EVEX_K; +} + +defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + 
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations @@ -4200,3 +4342,19 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, GR8:$mask), (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; + +def : Pat<(store VK1:$src, addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>; + +def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1; +}]>; + +def : Pat<(truncstorei1 GR8:$src, addr:$dst), + (MOV8mr addr:$dst, GR8:$src)>; + diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index aaef4a4..e421f8c 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -52,7 +52,8 @@ struct X86AddressMode { unsigned GVOpFlags; X86AddressMode() - : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) { + : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr), + GVOpFlags(0) { Base.Reg = 0; } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 401849f..34d8fb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1187,9 +1187,9 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0); + CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0); + CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); return (~KnownZero0 & ~KnownZero1) == 0; }]>; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index df6c9da..c0a6864 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -19,8 +19,9 @@ let Constraints = "$src1 = $dst" in { multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, + bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -28,7 +29,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def m : FMA3<opc, 
MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -36,7 +37,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -44,7 +45,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -59,18 +60,27 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - let isCommutable = 1 in + // For 213, both the register and memory variant are commutable. + // Indeed, the commutable operands are 1 and 2 and both live in registers + // for both variants. defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1, + Op>; let neverHasSideEffects = 1 in { defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; - let isCommutable = 1 in + // For 231, only the register variant is commutable. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, it cannot be swapped with 2. defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256>; + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; } // neverHasSideEffects = 1 } @@ -119,8 +129,9 @@ let ExeDomain = SSEPackedDouble in { let Constraints = "$src1 = $dst" in { multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, ValueType OpVT, PatFrag mem_frag, + bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, @@ -128,7 +139,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, @@ -147,14 +158,21 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, let neverHasSideEffects = 1 in { defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC, OpVT, mem_frag>; - let isCommutable = 1 in + // See the other defm of r231 for the explanation regarding the + // commutable flags. 
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC, OpVT, mem_frag>; + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; } -let isCommutable = 1 in +// See the other defm of r213 for the explanation regarding the +// commutable flags. defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpVT, mem_frag, OpNode>; + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1, + OpNode>; } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 486e5a9..1582f43 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -81,7 +81,7 @@ def X86pinsrb : SDNode<"X86ISD::PINSRB", def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; -def X86insrtps : SDNode<"X86ISD::INSERTPS", +def X86insertps : SDNode<"X86ISD::INSERTPS", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", @@ -175,6 +175,9 @@ def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 6450f2a..6993577 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -36,11 +36,13 @@ #include "llvm/Target/TargetOptions.h" #include <limits> +using namespace llvm; + +#define DEBUG_TYPE "x86-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "X86GenInstrInfo.inc" -using namespace llvm; - static cl::opt<bool> NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions")); @@ -1511,12 +1513,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const { - if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() && - MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() && - MI->getOperand(Op+1).getImm() == 1 && - MI->getOperand(Op+2).getReg() == 0 && - MI->getOperand(Op+3).getImm() == 0) { - FrameIndex = MI->getOperand(Op).getIndex(); + if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && + MI->getOperand(Op+X86::AddrScaleAmt).isImm() && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + MI->getOperand(Op+X86::AddrDisp).isImm() && + MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && + MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && + MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { + FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); return true; } return false; @@ -1680,15 +1684,16 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsMOVAPSrm: case X86::FsMOVAPDrm: { // Loads from constant pools are trivially rematerializable. 
- if (MI->getOperand(1).isReg() && - MI->getOperand(2).isImm() && - MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && + if (MI->getOperand(1+X86::AddrBaseReg).isReg() && + MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && MI->isInvariantLoad(AA)) { - unsigned BaseReg = MI->getOperand(1).getReg(); + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. - if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal()) + if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal()) return false; const MachineFunction &MF = *MI->getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1699,13 +1704,14 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::LEA32r: case X86::LEA64r: { - if (MI->getOperand(2).isImm() && - MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && - !MI->getOperand(4).isReg()) { + if (MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + !MI->getOperand(1+X86::AddrDisp).isReg()) { // lea fi#, lea GV, etc. are all rematerializable. - if (!MI->getOperand(1).isReg()) + if (!MI->getOperand(1+X86::AddrBaseReg).isReg()) return true; - unsigned BaseReg = MI->getOperand(1).getReg(); + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. @@ -1722,12 +1728,8 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, return true; } -/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that -/// would clobber the EFLAGS condition register. Note the result may be -/// conservative. If it cannot definitely determine the safety after visiting -/// a few instructions in each direction it assumes it's not safe. -static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { +bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { MachineBasicBlock::iterator E = MBB.end(); // For compile time consideration, if we are not able to determine the @@ -1998,7 +2000,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); unsigned leaInReg2 = 0; - MachineInstr *InsMI2 = 0; + MachineInstr *InsMI2 = nullptr; if (Src == Src2) { // ADD16rr %reg1028<kill>, %reg1028 // just a single insert_subreg. @@ -2062,14 +2064,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // convert them to equivalent lea if the condition code register def's // are dead! if (hasLiveCondCodeDef(MI)) - return 0; + return nullptr; MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. const MachineOperand &Dest = MI->getOperand(0); const MachineOperand &Src = MI->getOperand(1); - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. 
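As a standalone illustration of one convertToThreeAddress rewrite handled below: a left shift by 1, 2, or 3 fits the LEA scale encodings 2/4/8 (the isTruncatedShiftCountForLEA guard), so a two-address shll can become a three-address leal that leaves its source register live. The driver is illustrative, not part of the patch; the asm in the comments shows the intended encodings.

#include <cassert>

// Mirrors the isTruncatedShiftCountForLEA guard used below.
bool shiftFitsLEAScale(unsigned ShAmt) { return ShAmt > 0 && ShAmt < 4; }

int main() {
  unsigned x = 5, ShAmt = 3;
  assert(shiftFitsLEAScale(ShAmt));
  unsigned viaShift = x << ShAmt; // shll $3, %eax  (overwrites %eax)
  unsigned viaLEA = x * 8u;       // leal (,%rax,8), %ecx (keeps %eax live)
  assert(viaShift == viaLEA && viaLEA == 40);
  return 0;
}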
@@ -2080,11 +2082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0; + if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) .addOperand(Dest).addOperand(Src).addImm(M); @@ -2092,11 +2094,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0; + if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); // Convert to PSHUFD mask. @@ -2109,13 +2111,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) - return 0; + return nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) .addOperand(Dest) @@ -2125,7 +2127,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL32ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; @@ -2135,7 +2137,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2151,10 +2153,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL16ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest) .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); @@ -2163,7 +2165,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, default: { switch (MIOpc) { - default: return 0; + default: return nullptr; case X86::INC64r: case X86::INC32r: case X86::INC64_32r: { @@ -2175,7 +2177,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2189,7 +2191,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::INC16r: case X86::INC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), 1); @@ -2206,7 +2209,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2221,7 +2224,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::DEC16r: case X86::DEC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), -1); @@ -2242,7 +2246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; const MachineOperand &Src2 = MI->getOperand(2); bool isKill2, isUndef2; @@ -2250,7 +2254,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, isUndef2, ImplicitOp2)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest); @@ -2272,7 +2276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16rr: case X86::ADD16rr_DB: { if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); @@ -2311,7 +2316,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2327,7 +2332,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16ri_DB: case X86::ADD16ri8_DB: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), @@ -2337,7 +2343,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } } - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (LV) { // Update live variables if (Src.isKill()) @@ -2789,11 +2795,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); @@ -3549,6 +3555,26 @@ inline static bool isDefConvertible(MachineInstr *MI) { } } +/// isUseDefConvertible - check whether the use can be converted +/// to remove a comparison against zero. +static X86::CondCode isUseDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return X86::COND_INVALID; + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + return X86::COND_B; + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + return X86::COND_E; + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return X86::COND_B; + } +} + /// optimizeCompareInstr - Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. @@ -3615,13 +3641,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); - if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() || - !isDefConvertible(MI))) + if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) return false; + // If we have a use of the source register between the def and our compare + // instruction we can eliminate the compare iff the use sets EFLAGS in the + // right way. + bool ShouldUpdateCC = false; + X86::CondCode NewCC = X86::COND_INVALID; + if (IsCmpZero && !isDefConvertible(MI)) { + // Scan forward from the use until we hit the use we're looking for or the + // compare instruction. + for (MachineBasicBlock::iterator J = MI;; ++J) { + // Do we have a convertible instruction? 
+ NewCC = isUseDefConvertible(J); + if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && + J->getOperand(1).getReg() == SrcReg) { + assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); + ShouldUpdateCC = true; // Update CC later on. + // This is not a def of SrcReg, but still a def of EFLAGS. Keep going + // with the new def. + MI = Def = J; + break; + } + + if (J == I) + return false; + } + } + // We are searching for an earlier instruction that can make CmpInstr // redundant and that instruction will be saved in Sub. - MachineInstr *Sub = NULL; + MachineInstr *Sub = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); // We iterate backward, starting from the instruction before CmpInstr and @@ -3634,7 +3685,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, RE = CmpInstr->getParent() == MI->getParent() ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : CmpInstr->getParent()->rend(); - MachineInstr *Movr0Inst = 0; + MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { MachineInstr *Instr = &*RI; // Check whether CmpInstr can be made redundant by the current instruction. @@ -3716,13 +3767,28 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // CF and OF are used, we can't perform this optimization. return false; } + + // If we're updating the condition code check if we have to reverse the + // condition. + if (ShouldUpdateCC) + switch (OldCC) { + default: + return false; + case X86::COND_E: + break; + case X86::COND_NE: + NewCC = GetOppositeBranchCondition(NewCC); + break; + } } else if (IsSwapped) { // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. - X86::CondCode NewCC = getSwappedCondition(OldCC); + NewCC = getSwappedCondition(OldCC); if (NewCC == X86::COND_INVALID) return false; + } + if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { // Synthesize the new opcode. bool HasMemoryOperand = Instr.hasOneMemOperand(); unsigned NewOpc; @@ -3809,19 +3875,19 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) - return 0; + return nullptr; // To be conservative, if there exists another load, clear the load candidate. if (MI->mayLoad()) { FoldAsLoadDefReg = 0; - return 0; + return nullptr; } // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; - if (!DefMI->isSafeToMove(this, 0, SawStore)) - return 0; + if (!DefMI->isSafeToMove(this, nullptr, SawStore)) + return nullptr; // We try to commute MI if possible. unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; @@ -3838,12 +3904,12 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, continue; // Do not fold if we have a subreg use or a def or multiple uses. if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return 0; + return nullptr; SrcOperandId = i; FoundSrcOperand = true; } - if (!FoundSrcOperand) return 0; + if (!FoundSrcOperand) return nullptr; // Check whether we can fold the def into SrcOperandId. SmallVector<unsigned, 8> Ops; @@ -3857,22 +3923,22 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (Idx == 1) { // MI was changed but it didn't help, commute it back! 
commuteInstruction(MI, false); - return 0; + return nullptr; } // Check whether we can commute MI and enable folding. if (MI->isCommutable()) { MachineInstr *NewMI = commuteInstruction(MI, false); // Unable to commute. - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (NewMI != MI) { // New instruction. It doesn't need to be kept. NewMI->eraseFromParent(); - return 0; + return nullptr; } } } - return 0; + return nullptr; } /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr @@ -4007,7 +4073,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Align) const { - const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + const DenseMap<unsigned, + std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect(); bool isTwoAddrFold = false; @@ -4015,7 +4082,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // when X86Subtarget is Atom. if (isCallRegIndirect && (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { - return NULL; + return nullptr; } unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4026,9 +4093,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. if (MI->getOpcode() == X86::ADD32ri && MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return NULL; + return nullptr; - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. @@ -4063,7 +4130,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Opcode = I->second.first; unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; if (Align < MinAlign) - return NULL; + return nullptr; bool NarrowToMOV32rm = false; if (Size) { unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); @@ -4071,12 +4138,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) - return NULL; + return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is due // to liveintervalanalysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) - return NULL; + return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } @@ -4105,7 +4172,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; - return NULL; + return nullptr; } /// hasPartialRegUpdate - Return true for all instructions that only update @@ -4270,14 +4337,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, int FrameIndex) const { // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). 
hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Size = MFI->getObjectSize(FrameIndex); @@ -4290,7 +4357,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned NewOpc = 0; unsigned RCSize = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; @@ -4299,12 +4366,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Size < RCSize) - return NULL; + return nullptr; // Change to CMPXXri r, 0 first. MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; SmallVector<MachineOperand,4> MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); @@ -4322,14 +4389,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; // Determine the alignment of the load. unsigned Alignment = 0; @@ -4352,12 +4419,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - return 0; + return nullptr; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; @@ -4367,12 +4434,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) - return NULL; + return nullptr; SmallVector<MachineOperand,X86::AddrNumOperands> MOs; switch (LoadMI->getOpcode()) { @@ -4388,7 +4455,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Medium and large mode can't fold loads this way. if (TM.getCodeModel() != CodeModel::Small && TM.getCodeModel() != CodeModel::Kernel) - return NULL; + return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; @@ -4400,7 +4467,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // This doesn't work for several reasons. // 1. GlobalBaseReg may have been spilled. // 2. It may not be live at MI. - return NULL; + return nullptr; } // Create a constant-pool entry. @@ -4436,14 +4503,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, > 4) // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes). 
- return NULL; + return nullptr; if ((LoadMI->getOpcode() == X86::MOVSDrm || LoadMI->getOpcode() == X86::VMOVSDrm) && MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() > 8) // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes). - return NULL; + return nullptr; // Folding a normal load. Just copy the load's address operands. for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -4489,7 +4556,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. - const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + const DenseMap<unsigned, + std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 @@ -4671,7 +4739,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, AddrOps.push_back(Chain); // Emit the load instruction. - SDNode *Load = 0; + SDNode *Load = nullptr; if (FoldedLoad) { EVT VT = *RC->vt_begin(); std::pair<MachineInstr::mmo_iterator, @@ -4696,7 +4764,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, // Emit the data processing instruction. std::vector<EVT> VTs; - const TargetRegisterClass *DstRC = 0; + const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); VTs.push_back(*DstRC->vt_begin()); @@ -5190,14 +5258,14 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; - return 0; + return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) if (ReplaceableInstrsAVX2[i][domain-1] == opcode) return ReplaceableInstrsAVX2[i]; - return 0; + return nullptr; } std::pair<uint16_t, uint16_t> @@ -5327,8 +5395,10 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF.getTarget()); - assert(!TM->getSubtarget<X86Subtarget>().is64Bit() && - "X86-64 PIC uses RIP relative addressing"); + // Don't do anything if this is 64-bit as 64-bit PIC + // uses RIP relative addressing. + if (TM->getSubtarget<X86Subtarget>().is64Bit()) + return false; // Only emit a global base reg in PIC mode. if (TM->getRelocationModel() != Reloc::PIC_) @@ -5383,7 +5453,7 @@ namespace { char CGBR::ID = 0; FunctionPass* -llvm::createGlobalBaseRegPass() { return new CGBR(); } +llvm::createX86GlobalBaseRegPass() { return new CGBR(); } namespace { struct LDTLSCleanup : public MachineFunctionPass { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 156291e..5f34915 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -325,7 +325,7 @@ public: /// value. unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const override; + unsigned *LoadRegIndex = nullptr) const override; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. 
It @@ -359,6 +359,13 @@ public: /// instruction that defines the specified register class. bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction that + /// would clobber the EFLAGS condition register. Note the result may be + /// conservative. If it cannot definitely determine the safety after visiting + /// a few instructions in each direction it assumes it's not safe. + bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return X86II::isX86_64ExtendedReg(MO.getReg()); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 8edf873..0d97669 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -206,6 +206,8 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -249,7 +251,6 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; -def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>; def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -2001,6 +2002,46 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { (implicit EFLAGS)]>, XS; } +let Predicates = [HasLZCNT] in { + def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; +} + //===----------------------------------------------------------------------===// // BMI Instructions // @@ -2077,6 +2118,47 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; } +let Predicates = [HasBMI]
in { + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; +} + + multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { @@ -2104,18 +2186,38 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { int_x86_bmi_bzhi_64, loadi64>, VEX_W; } -def : Pat<(X86bzhi GR32:$src1, GR8:$src2), - (BZHI32rr GR32:$src1, - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2), - (BZHI32rm addr:$src1, - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi GR64:$src1, GR8:$src2), - (BZHI64rr GR64:$src1, - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2), - (BZHI64rm addr:$src1, - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + +def CountTrailingOnes : SDNodeXForm<imm, [{ + // Count the trailing ones in the immediate. 
+ return getI8Imm(CountTrailingOnes_64(N->getZExtValue())); +}]>; + +def BZHIMask : ImmLeaf<i64, [{ + return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32); +}]>; + +let Predicates = [HasBMI2] in { + def : Pat<(and GR64:$src, BZHIMask:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + + def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI32rr GR32:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI32rm addr:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; +} // HasBMI2 let Predicates = [HasBMI] in { def : Pat<(X86bextr GR32:$src1, GR32:$src2), @@ -2617,21 +2719,21 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. -def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst)>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst)>, Requires<[In16BitMode]>; - -def : InstAlias<"call *$dst", (CALL64m i16mem:$dst)>, Requires<[In64BitMode]>; -def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst)>, Requires<[In64BitMode]>; -def : InstAlias<"call *$dst", (CALL32m i16mem:$dst)>, Requires<[In32BitMode]>; -def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst)>, Requires<[In32BitMode]>; -def : InstAlias<"call *$dst", (CALL16m i16mem:$dst)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst)>, Requires<[In16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call *$dst", (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call *$dst", (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst), 
0>, Requires<[In32BitMode]>; +def : InstAlias<"call *$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". @@ -2664,11 +2766,11 @@ def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>; -def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; +def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; +def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. def : InstAlias<"movq $src, $dst", @@ -2705,7 +2807,7 @@ def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity // errors, since its encoding is the most compact. -def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; // shld/shrd op,op -> shld op, op, CL def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; @@ -2751,19 +2853,29 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. -def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", + (TEST8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", + (TEST16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", + (TEST32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", + (TEST64rm GR64:$val, i64mem:$mem), 0>; // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. -def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. 
-def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>; -def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[Not64BitMode]>; -def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; -def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>; +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 050ee39..ecf80a1 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -254,6 +254,11 @@ let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +} } // SchedRW let SchedRW = [WriteLoad] in { @@ -262,11 +267,12 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (load_mmx addr:$src))], IIC_MMX_MOVQ_RM>; +} // SchedRW +let SchedRW = [WriteStore] in def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (x86mmx VR64:$src), addr:$dst)], IIC_MMX_MOVQ_RM>; -} // SchedRW let SchedRW = [WriteMove] in { def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f2f3967..1eb0485 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1561,9 +1561,9 @@ defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, let Predicates = [UseAVX] in { def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -1627,9 +1627,9 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", - (CVTSI2SSrm FR64:$dst, i32mem:$src)>; + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", - (CVTSI2SDrm FR64:$dst, i32mem:$src)>; + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). 
@@ -2005,7 +2005,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2024,7 +2024,7 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", - (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; } def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -2127,7 +2127,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq @@ -2146,7 +2146,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let Predicates = [HasAVX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), @@ -2252,7 +2252,7 @@ def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", - (VCVTPD2PSrr VR128:$dst, VR128:$src)>; + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2271,7 +2271,7 @@ def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", - (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", @@ -2973,6 +2973,19 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; +// AVX1 requires type coercions in order to fold loads directly into logical +// operations. 
+let Predicates = [HasAVX1Only] in { + def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Arithmetic Instructions //===----------------------------------------------------------------------===// @@ -3144,23 +3157,23 @@ let Predicates = [UseSSE2] in { let Predicates = [UseSSE41] in { // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insrtps rather than a X86Movss. + // instruction is lowered into a X86insertps rather than a X86Movss. // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insrtps. + // make sure that we correctly match the X86insertps. - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3186,19 +3199,19 @@ let Predicates = [HasAVX] in { (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 
(X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -4068,6 +4081,10 @@ defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, SSE_INTALUQ_ITINS_P, 1>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, SSE_INTMUL_ITINS_P, 1>; +defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; +defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, SSE_INTALU_ITINS_P, 0>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, @@ -4102,10 +4119,6 @@ defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; -defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, - int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>; -defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, - int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, @@ -6515,7 +6528,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, + (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), @@ -6524,7 +6537,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insrtps VR128:$src1, + (X86insertps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), imm:$src3))], itins.rm>, Sched<[WriteFShuffleLd, ReadAfterLd]>; @@ -6537,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; } +let Predicates = [UseSSE41] in { + // If we're inserting an element from a load or a null pshuf of a load, + // fold the load into the insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 + (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), + imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd + (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +let Predicates = [UseAVX] in { + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. 
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// @@ -6990,6 +7026,31 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst +/// types. +multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, @@ -7018,8 +7079,9 @@ let Predicates = [HasAVX] in { defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, - 0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V; + defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, + VR128, loadv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -7051,9 +7113,9 @@ let Predicates = [HasAVX2] in { defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", - int_x86_avx2_pmul_dq, WriteVecIMul>, - VEX_4V, VEX_L; + defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -7076,8 +7138,9 @@ let Constraints = "$src1 = $dst" in { memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, - 1, SSE_INTMUL_ITINS_P>; + defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; } let Predicates = [HasAVX] in { @@ -7394,6 +7457,7 @@ let Predicates = [UseSSE41] in { } +let SchedRW = [WriteLoad] in { let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", @@ -7407,6 +7471,7 @@ def VMOVNTDQAYrm : SS48I<0x2A, 
MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; +} // SchedRW //===----------------------------------------------------------------------===// // SSE4.2 - Compare Instructions @@ -7831,18 +7896,20 @@ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), multiclass pclmul_alias<string asm, int immop> { def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; + (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>; def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>; def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), + 0>; def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>; + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), + 0>; } defm : pclmul_alias<"hqhq", 0x11>; defm : pclmul_alias<"hqlq", 0x01>; @@ -8291,6 +8358,12 @@ let Predicates = [HasF16C] in { defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; + + // Pattern match vcvtph2ps of a scalar i64 load. + def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 9d3aa1c..b5595cb 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -19,7 +19,7 @@ let Defs = [RAX, RDX] in TB; let Defs = [RAX, RCX, RDX] in - def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; + def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB; // CPU flow control instructions diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index e99f2d9..e969ef2 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "X86JITInfo.h" #include "X86Relocations.h" #include "X86Subtarget.h" @@ -24,6 +23,8 @@ #include <cstring> using namespace llvm; +#define DEBUG_TYPE "jit" + // Determine the platform we're running on #if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) # define X86_64_JIT @@ -427,9 +428,14 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { TsanIgnoreWritesEnd(); #if defined (X86_32_JIT) && !defined (_MSC_VER) +#if defined(__SSE__) + // SSE Callback should be called for SSE-enabled LLVM. 
+ return X86CompilationCallback_SSE; +#else if (Subtarget->hasSSE1()) return X86CompilationCallback_SSE; #endif +#endif return X86CompilationCallback; } @@ -437,7 +443,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { Subtarget = &TM.getSubtarget<X86Subtarget>(); useGOT = 0; - TLSOffset = 0; + TLSOffset = nullptr; } void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 6d7f3cb..0190080 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -120,7 +120,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) { + if (!StubSym.getPointer()) { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: @@ -132,7 +132,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getHiddenGVStubEntry(Sym); - if (StubSym.getPointer() == 0) { + if (!StubSym.getPointer()) { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: @@ -168,7 +168,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. - const MCExpr *Expr = 0; + const MCExpr *Expr = nullptr; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; switch (MO.getTargetFlags()) { @@ -223,7 +223,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, break; } - if (Expr == 0) + if (!Expr) Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 746d0d6..6639875 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -15,9 +15,9 @@ #include <algorithm> -#define DEBUG_TYPE "x86-pad-short-functions" #include "X86.h" #include "X86InstrInfo.h" +#include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -30,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "x86-pad-short-functions" + STATISTIC(NumBBsPadded, "Number of basic blocks padded"); namespace { @@ -49,7 +51,7 @@ namespace { struct PadShortFunc : public MachineFunctionPass { static char ID; PadShortFunc() : MachineFunctionPass(ID) - , Threshold(4), TM(0), TII(0) {} + , Threshold(4), TM(nullptr), TII(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -100,6 +102,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { } TM = &MF.getTarget(); + if (!TM->getSubtarget<X86Subtarget>().padShortFunctions()) + return false; + TII = TM->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 85aa9b5..a83e1e4 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -38,11 +38,11 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +using namespace llvm; + #define GET_REGINFO_TARGET_DESC 
#include "X86GenRegisterInfo.inc" -using namespace llvm; - cl::opt<bool> ForceStackAlign("force-align-stack", cl::desc("Force align the stack to the minimum alignment" @@ -129,7 +129,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, if (!Is64Bit && SubIdx == X86::sub_8bit) { A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); if (!A) - return 0; + return nullptr; } return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } @@ -231,7 +231,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -const uint16_t * +const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 6a71113..2289d91 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -100,7 +100,7 @@ public: /// getCalleeSavedRegs - Return a null-terminated list of all of the /// callee-save registers on this target. - const uint16_t * + const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; const uint32_t *getCallPreservedMask(CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const; @@ -122,7 +122,7 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index f5b51ee..6966d61 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel { let LoadLatency = 4; let MispredictPenalty = 16; + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow // the scheduler to assign a default model to unrecognized opcodes. let CompleteModel = 0; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index a58859a..83f0534 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel { let LoadLatency = 4; let MispredictPenalty = 16; + // Based on the LSD (loop-stream detector) queue size. + let LoopMicroOpBufferSize = 28; + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow // the scheduler to assign a default model to unrecognized opcodes. let CompleteModel = 0; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index ba72f29..3256ee7 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel { let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. + // On the Atom, the throughput for taken branches is 2 cycles. For small + // simple loops, expand by a small factor to hide the backedge cost. 
+ let LoopMicroOpBufferSize = 10; + let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 6c2a304..823d101 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -1,4 +1,4 @@ -//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// +//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,662 +7,225 @@ // //===----------------------------------------------------------------------===// // -// This file defines the itinerary class data for the Intel Atom -// (Silvermont) processor. +// This file defines the machine model for Intel Silvermont to support +// instruction scheduling and other instruction cost heuristics. // //===----------------------------------------------------------------------===// -def IEC_RSV0 : FuncUnit; -def IEC_RSV1 : FuncUnit; -def FPC_RSV0 : FuncUnit; -def FPC_RSV1 : FuncUnit; -def MEC_RSV : FuncUnit; - - - - - - - - - - - - - - -def SLMItineraries : ProcessorItineraries< - [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ], - [], [ - // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>] - // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>] - // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>] - // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>] - // - // Default is 1 cycle, IEC_RSV0 or IEC_RSV1 - //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // mul - InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - // imul by al, ax, eax, rax - InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - // imul reg by reg|mem - InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - // imul reg = reg/mem * imm - InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, 
[IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - // idiv - min latency - InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >, - // div - min latency - InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<25, [MEC_RSV]>] >, - InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >, - // neg/not/inc/dec - InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - // add/sub/and/or/xor/adc/sbc/cmp/test - InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - // adc/sbb - InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - // shift/rotate - InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>, - InstrStage<1, [MEC_RSV]>] >, - // shift double - InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, - InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, - InstrStage<4, [MEC_RSV]>] >, - // cmov - InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, - // set - InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // jcc - 
InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // jcxz/jecxz/jrcxz - InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // jmp rel - InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // jmp indirect - InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - // jmp far - InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // loop/loope/loopne - InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // call - all but reg/imm - InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - //ret - InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - //sign extension movs - InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - //zero extension movs - InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - // SSE binary operations - // arithmetic fp scalar - InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<13, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<13, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - - 
InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - - // arithmetic fp parallel - InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<27, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<27, [MEC_RSV]>] >, - - // bitwise parallel - InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - // arithmetic int parallel - InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - - // multiply int parallel - InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>, - InstrStage<5, [MEC_RSV]>] >, - - // shift parallel - InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >, - - InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >, - - InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >, - - InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>, - InstrStage<26, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>, - InstrStage<13, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>, - InstrStage<26, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>, - InstrStage<13, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 
0>, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >, - InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>, - InstrStage<4, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - - InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - - InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - - InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - - InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - - InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData<IIC_SSE_PMADD, 
[InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - // conversions - // to/from PD ... - InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<5, [MEC_RSV]>] >, - // to/from PS except to/from PD and PS2PI - InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - - // MMX MOVs - InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // other MMX - InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PSADBW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, 
[IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PEXTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // conversions - // from/to PD - InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // from/to PI - InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - - InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +def SLMModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SLM can decode 2 + // instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 32; // Based on the reorder buffer. + let LoadLatency = 3; + let MispredictPenalty = 10; + + // For small loops, expand by a small factor to hide the backedge cost. + let LoopMicroOpBufferSize = 10; + + // FIXME: SSE4 is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} - InstrItinData<IIC_FST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FIST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +let SchedModel = SLMModel in { + +// Silvermont has 5 reservation stations for micro-ops + +def IEC_RSV0 : ProcResource<1>; +def IEC_RSV1 : ProcResource<1>; +def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } +def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; } +def MEC_RSV : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>; +def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>; + +def SMDivider : ProcResource<1>; +def SMFPMultiplier : ProcResource<1>; +def SMFPDivider : ProcResource<1>; + +// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the + // latency. 
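
An aside before the folded-load definition that completes the multiclass just below: two latency rules stated above are easy to make concrete. ReadAdvance<ReadAfterLd, 3> lets a load-op consumer tolerate a producer that finishes up to 3 cycles late, and the folded variant simply adds the 3-cycle load latency (!add(Lat, 3)). A minimal sketch of that arithmetic; the function names and standalone form are mine, not LLVM scheduler API:

    #include <cassert>

    // Hypothetical helpers mirroring the two rules above, not LLVM API.
    static const int SLMLoadLatency = 3;

    // Folded-load variant: register-form latency plus the load latency,
    // the same computation as !add(Lat, 3) in SMWriteResPair.
    static int foldedLatency(int regFormLat) {
      return regFormLat + SLMLoadLatency;
    }

    // ReadAdvance<ReadAfterLd, 3>: a load-op reads its register operand 3
    // cycles into its own execution, hiding up to 3 cycles of the producer.
    static int exposedProducerLatency(int producerLat) {
      return producerLat > 3 ? producerLat - 3 : 0;
    }

    int main() {
      assert(foldedLatency(1) == 4);          // 1-cycle ALU op: 4 with a load
      assert(exposedProducerLatency(3) == 0); // 3-cycle producer fully hidden
      assert(exposedProducerLatency(5) == 2); // only 2 of 5 cycles exposed
      return 0;
    }
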
+ def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> { + let Latency = !add(Lat, 3); + } +} - InstrItinData<IIC_FLDZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FUCOM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FLDCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FFREE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_WAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FXAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FNOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FLDL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_F2XM1, [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FYL2X, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FPTAN, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FPATAN, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FXTRACT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FPREM1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FPSTP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FPREM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FYL2XP1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FSINCOS, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FRNDINT, [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FSCALE, [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_FCOMPP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FXSAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FXRSTOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, +// A folded store needs a cycle on MEC_RSV for the store data, but it does not +// need an extra port cycle to recompute the address. +def : WriteRes<WriteRMW, [MEC_RSV]>; + +def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>; +def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteMove, [IEC_RSV01]>; +def : WriteRes<WriteZero, []>; + +defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>; +defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>; +defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>; +defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [IEC_RSV1]>; + +// This is quite rough, latency depends on the dividend. 
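
The WriteIDiv entries just below (to which the roughness caveat above applies) pair a 25-cycle latency with ResourceCycles = [1, 25]: the issue port is held for one cycle, but SMDivider stays occupied for 25, so independent divides serialize. A small sketch of that throughput effect; the struct and names are illustrative, not LLVM API:

    #include <cassert>

    // An unpipelined unit: each operation keeps it busy for `busy` cycles,
    // which is what ResourceCycles = [1, 25] expresses for SMDivider.
    struct UnpipelinedUnit {
      int freeAt = 0;                     // first cycle the unit is idle again
      int issue(int request, int busy) {  // returns the cycle the op starts
        int start = request > freeAt ? request : freeAt;
        freeAt = start + busy;
        return start;
      }
    };

    int main() {
      UnpipelinedUnit divider;
      int first  = divider.issue(/*request=*/0, /*busy=*/25);
      int second = divider.issue(/*request=*/1, /*busy=*/25);
      assert(first == 0 && second == 25); // throughput: one divide per 25 cycles
      return 0;
    }

The latency numbers themselves are rough by the model's own admission, since the real divider's cycle count depends on the operand values.
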
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 25]; +} - // System instructions - InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_INT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_INT3, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_INVD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IRET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_HLT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LXS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_RSM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SLDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_STR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// Scalar and vector floating point. +defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; +defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; +defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>; + +// This is quite rough, latency depends on precision +def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> { + let Latency = 5; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> { + let Latency = 8; + let ResourceCycles = [1, 1, 2]; +} - InstrItinData<IIC_IN_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_IN_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_INS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> { + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> { + let Latency = 37; + let ResourceCycles = [1, 1, 34]; +} - InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // worst case for mov REG_CRx - InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// Vector integer operations. +defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>; +defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>; + +// String instructions. 
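
Before the string-compare entries that follow the header above, the WriteFMul definition earlier is worth a worked example: ResourceCycles = [1, 2] on SMFPMultiplier means the multiplier accepts a new operation every 2 cycles even though each result takes 5. A sketch of that latency/recurrence split, with illustrative names only:

    #include <cassert>

    // A partially pipelined unit: results take `latency` cycles, but a new
    // operation can start every `recurrence` cycles (here 5 and 2).
    struct PipelinedUnit {
      int freeAt = 0;
      int resultAt(int request, int recurrence, int latency) {
        int start = request > freeAt ? request : freeAt;
        freeAt = start + recurrence; // unit re-accepts after 2 cycles
        return start + latency;      // result appears 5 cycles after start
      }
    };

    int main() {
      PipelinedUnit mul;
      assert(mul.resultAt(0, 2, 5) == 5); // first multiply finishes at cycle 5
      assert(mul.resultAt(1, 2, 5) == 7); // second starts at cycle 2, not 5
      return 0;
    }
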
+// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 13; + let ResourceCycles = [13, 1]; +} - InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // LAR - InstrItinData<IIC_LAR_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LAR_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // LSL - InstrItinData<IIC_LSL_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LSL_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} - InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // push control register, segment registers - InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // pop control register, segment registers - InstrItinData<IIC_POP_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // VERR, VERW - InstrItinData<IIC_VERR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // WRMSR, RDMSR - InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - // SMSW, LMSW - InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} - InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> { + let Latency = 21; + let ResourceCycles = [21]; +} +def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 21; + let ResourceCycles = [21, 1]; +} - InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +// AES Instructions. 
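
In the string-compare entries above, the ResourceCycles on FPC_RSV0 equal the latency (13, 17, or 21), so the port is blocked for the instruction's full duration and reciprocal throughput degenerates to the latency; the AES entries below relax this slightly (8-cycle latency, 5 busy cycles). Worked numbers for the blocking case, as an illustration rather than LLVM code:

    #include <cassert>

    // WritePCmpIStrM above: latency 13, FPC_RSV0 busy for all 13 cycles.
    // A second identical op cannot start until the port frees.
    int main() {
      const int latency = 13, portBusy = 13;
      int firstStart = 0;
      int secondStart = firstStart + portBusy;     // port frees at cycle 13
      assert(secondStart - firstStart == latency); // one op per 13 cycles
      return 0;
    }
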
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, +def : WriteRes<WriteAESIMC, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<10, [MEC_RSV]>] >, - InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, 
IEC_RSV1]>] >, - InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >, - InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<10, [MEC_RSV]>] >, - InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, - InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<12, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<15, [MEC_RSV]>] >, - InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<11, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >, - InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >, - InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>, - InstrStage<10, [MEC_RSV]>] >, +def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles 
= [5]; +} +def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] > - ]>; +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [FPC_RSV0]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 10; + let ResourceCycles = [10, 1]; +} -// Silvermont machine model. -def SLMModel : SchedMachineModel { - let IssueWidth = 2; // Allows 2 instructions per scheduling group. - let MinLatency = 1; // InstrStage cycles overrides MinLatency. - // OperandCycles may be used for expected latency. - let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. - let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let Itineraries = SLMItineraries; -} +def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteFence, [MEC_RSV]>; +def : WriteRes<WriteNop, []>; + +// AVX is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +def : WriteRes<WriteIMulH, [FPC_RSV0]>; +defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; +} // SchedModel diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index b9c620f..744890d 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-selectiondag-info" #include "X86TargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" using namespace llvm; +#define DEBUG_TYPE "x86-selectiondag-info" + X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : TargetSelectionDAGInfo(TM), Subtarget(&TM.getSubtarget<X86Subtarget>()), @@ -50,7 +51,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { + V->isNullValue() ? 
Subtarget->getBZeroEntry() : nullptr) { EVT IntPtr = TLI.getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; @@ -60,15 +61,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); - TargetLowering:: - CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, - 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, - DAG, dl); - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(CLI); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -77,7 +77,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, } uint64_t SizeVal = ConstantSize->getZExtValue(); - SDValue InFlag(0, 0); + SDValue InFlag; EVT AVT; SDValue Count; ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); @@ -139,7 +139,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); if (TwoRepStos) { InFlag = Chain.getValue(1); @@ -153,7 +153,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); } else if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; @@ -225,7 +225,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Count = DAG.getIntPtrConstant(CountVal); unsigned BytesLeft = SizeVal % UBytes; - SDValue InFlag(0, 0); + SDValue InFlag; Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RCX : X86::ECX, Count, InFlag); @@ -241,8 +241,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, - array_lengthof(Ops)); + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); SmallVector<SDValue, 4> Results; Results.push_back(RepMovs); @@ -263,6 +262,5 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SrcPtrInfo.getWithOffset(Offset))); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Results[0], Results.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 207d0ba..989e0d6 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "subtarget" #include "X86Subtarget.h" #include "X86InstrInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" @@ -24,15 +24,24 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "X86GenSubtargetInfo.inc" -using namespace llvm; +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt<bool> +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, + cl::desc("Enable early if-conversion on X86")); -#if defined(_MSC_VER) -#include <intrin.h> -#endif /// ClassifyBlockAddressReference - Classify a blockaddress reference for the /// current subtarget according to how we should reference it in a non-pcrel @@ -153,7 +162,7 @@ const char *X86Subtarget::getBZeroEntry() const { !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; - return 0; + return nullptr; } bool X86Subtarget::hasSinCos() const { @@ -173,251 +182,16 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -static bool OSHasAVXSupport() { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) -#if defined(__GNUC__) - // Check xgetbv; this uses a .byte sequence instead of the instruction - // directly because older assemblers do not include support for xgetbv and - // there is no easy way to conditionally compile based on the assembler used. 
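
For context on the block being deleted below: the .byte 0x0f, 0x01, 0xd0 sequence encodes xgetbv, and (rEAX & 6) == 6 tests XCR0 bits 1 and 2, i.e. whether the OS saves both SSE and AVX register state on context switch. A standalone sketch of the same check, assuming a GCC-compatible compiler; on non-x86 targets it conservatively reports no support:

    #include <cstdio>

    // Mirrors the deleted check: read XCR0 via xgetbv (emitted as raw bytes
    // because old assemblers may not know the mnemonic) and require both the
    // SSE-state bit (1) and the AVX-state bit (2) to be set.
    static bool osSupportsAVX() {
    #if defined(__i386__) || defined(__x86_64__)
      unsigned eax, edx;
      __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
      return (eax & 6) == 6;
    #else
      return false;
    #endif
    }

    int main() {
      std::printf("OS saves AVX state: %s\n", osSupportsAVX() ? "yes" : "no");
      return 0;
    }
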
- int rEAX, rEDX; - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); -#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) - unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); -#else - int rEAX = 0; // Ensures we return false -#endif - return (rEAX & 6) == 6; -#else - return false; -#endif -} - -void X86Subtarget::AutoDetectSubtargetFeatures() { - unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; - unsigned MaxLevel; - union { - unsigned u[3]; - char c[12]; - } text; - - if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) || - MaxLevel < 1) - return; - - X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); - - if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } - if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); } - if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); } - if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); } - if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); } - if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} - if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} - if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} - if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) { - X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); - } - - bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; - bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; - - if ((ECX >> 1) & 0x1) { - HasPCLMUL = true; - ToggleFeature(X86::FeaturePCLMUL); - } - if ((ECX >> 12) & 0x1) { - HasFMA = true; - ToggleFeature(X86::FeatureFMA); - } - if (IsIntel && ((ECX >> 22) & 0x1)) { - HasMOVBE = true; - ToggleFeature(X86::FeatureMOVBE); - } - if ((ECX >> 23) & 0x1) { - HasPOPCNT = true; - ToggleFeature(X86::FeaturePOPCNT); - } - if ((ECX >> 25) & 0x1) { - HasAES = true; - ToggleFeature(X86::FeatureAES); - } - if ((ECX >> 29) & 0x1) { - HasF16C = true; - ToggleFeature(X86::FeatureF16C); - } - if (IsIntel && ((ECX >> 30) & 0x1)) { - HasRDRAND = true; - ToggleFeature(X86::FeatureRDRAND); - } - - if ((ECX >> 13) & 0x1) { - HasCmpxchg16b = true; - ToggleFeature(X86::FeatureCMPXCHG16B); - } - - if (IsIntel || IsAMD) { - // Determine if bit test memory instructions are slow. - unsigned Family = 0; - unsigned Model = 0; - X86_MC::DetectFamilyModel(EAX, Family, Model); - if (IsAMD || (Family == 6 && Model >= 13)) { - IsBTMemSlow = true; - ToggleFeature(X86::FeatureSlowBTMem); - } - - // Determine if SHLD/SHRD instructions have higher latency then the - // equivalent series of shifts/or instructions. - // FIXME: Add Intel's processors that have SHLD instructions with very - // poor latency. - if (IsAMD) { - IsSHLDSlow = true; - ToggleFeature(X86::FeatureSlowSHLD); - } - - // If it's an Intel chip since Nehalem and not an Atom chip, unaligned - // memory access is fast. We hard code model numbers here because they - // aren't strictly increasing for Intel chips it seems. 
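
The model-number table that follows keys off X86_MC::DetectFamilyModel, which follows the standard CPUID leaf-1 layout: 4-bit family and model fields in EAX, widened by the extended-model bits for families 6 and 15 and by the extended-family bits for family 15. A self-contained sketch of that decoding (the function name is mine; the bit layout is the architectural one):

    #include <cassert>

    static void decodeFamilyModel(unsigned eax, unsigned &family,
                                  unsigned &model) {
      family = (eax >> 8) & 0xf;
      model = (eax >> 4) & 0xf;
      if (family == 6 || family == 15) {
        if (family == 15)
          family += (eax >> 20) & 0xff;    // extended family, EAX[27:20]
        model += ((eax >> 16) & 0xf) << 4; // extended model, EAX[19:16]
      }
    }

    int main() {
      unsigned family, model;
      decodeFamilyModel(0x000306A9, family, model); // a typical IvyBridge EAX
      assert(family == 6 && model == 0x3A);         // matches the table below
      return 0;
    }
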
- if (IsIntel && - ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield, - // Jasper Froest - (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP - (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX - (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale - (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP - (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX - (Family == 6 && Model == 0x2A) || // SandyBridge - (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E* - (Family == 6 && Model == 0x3A) || // IvyBridge - (Family == 6 && Model == 0x3E) || // IvyBridge EP - (Family == 6 && Model == 0x3C) || // Haswell - (Family == 6 && Model == 0x3F) || // ... - (Family == 6 && Model == 0x45) || // ... - (Family == 6 && Model == 0x46))) { // ... - IsUAMemFast = true; - ToggleFeature(X86::FeatureFastUAMem); - } - - // Set processor type. Currently only Atom or Silvermont (SLM) is detected. - if (Family == 6 && - (Model == 28 || Model == 38 || Model == 39 || - Model == 53 || Model == 54)) { - X86ProcFamily = IntelAtom; - - UseLeaForSP = true; - ToggleFeature(X86::FeatureLeaForSP); - } - else if (Family == 6 && - (Model == 55 || Model == 74 || Model == 77)) { - X86ProcFamily = IntelSLM; - } - - unsigned MaxExtLevel; - X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); - - if (MaxExtLevel >= 0x80000001) { - X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); - if ((EDX >> 29) & 0x1) { - HasX86_64 = true; - ToggleFeature(X86::Feature64Bit); - } - if ((ECX >> 5) & 0x1) { - HasLZCNT = true; - ToggleFeature(X86::FeatureLZCNT); - } - if (IsIntel && ((ECX >> 8) & 0x1)) { - HasPRFCHW = true; - ToggleFeature(X86::FeaturePRFCHW); - } - if (IsAMD) { - if ((ECX >> 6) & 0x1) { - HasSSE4A = true; - ToggleFeature(X86::FeatureSSE4A); - } - if ((ECX >> 11) & 0x1) { - HasXOP = true; - ToggleFeature(X86::FeatureXOP); - } - if ((ECX >> 16) & 0x1) { - HasFMA4 = true; - ToggleFeature(X86::FeatureFMA4); - } - } - } - } - - if (MaxLevel >= 7) { - if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) { - if (IsIntel && (EBX & 0x1)) { - HasFSGSBase = true; - ToggleFeature(X86::FeatureFSGSBase); - } - if ((EBX >> 3) & 0x1) { - HasBMI = true; - ToggleFeature(X86::FeatureBMI); - } - if ((EBX >> 4) & 0x1) { - HasHLE = true; - ToggleFeature(X86::FeatureHLE); - } - if (IsIntel && ((EBX >> 5) & 0x1)) { - X86SSELevel = AVX2; - ToggleFeature(X86::FeatureAVX2); - } - if (IsIntel && ((EBX >> 8) & 0x1)) { - HasBMI2 = true; - ToggleFeature(X86::FeatureBMI2); - } - if (IsIntel && ((EBX >> 11) & 0x1)) { - HasRTM = true; - ToggleFeature(X86::FeatureRTM); - } - if (IsIntel && ((EBX >> 16) & 0x1)) { - X86SSELevel = AVX512F; - ToggleFeature(X86::FeatureAVX512); - } - if (IsIntel && ((EBX >> 18) & 0x1)) { - HasRDSEED = true; - ToggleFeature(X86::FeatureRDSEED); - } - if (IsIntel && ((EBX >> 19) & 0x1)) { - HasADX = true; - ToggleFeature(X86::FeatureADX); - } - if (IsIntel && ((EBX >> 26) & 0x1)) { - HasPFI = true; - ToggleFeature(X86::FeaturePFI); - } - if (IsIntel && ((EBX >> 27) & 0x1)) { - HasERI = true; - ToggleFeature(X86::FeatureERI); - } - if (IsIntel && ((EBX >> 28) & 0x1)) { - HasCDI = true; - ToggleFeature(X86::FeatureCDI); - } - if (IsIntel && ((EBX >> 29) & 0x1)) { - HasSHA = true; - ToggleFeature(X86::FeatureSHA); - } - } - if (IsAMD && ((ECX >> 21) & 0x1)) { - HasTBM = true; - ToggleFeature(X86::FeatureTBM); - } - } -} - void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { AttributeSet 
FnAttrs = MF->getFunction()->getAttributes(); - Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, - "target-cpu"); - Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, - "target-features"); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); std::string CPU = - !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : ""; + !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : ""; std::string FS = - !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : ""; + !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : ""; if (!FS.empty()) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); @@ -426,54 +200,23 @@ void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; - if (!FS.empty() || !CPU.empty()) { - if (CPUName.empty()) { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - CPUName = sys::getHostCPUName(); -#else - CPUName = "generic"; -#endif - } - - // Make sure 64-bit features are available in 64-bit mode. (But make sure - // SSE2 can be turned off explicitly.) - std::string FullFS = FS; - if (In64BitMode) { - if (!FullFS.empty()) - FullFS = "+64bit,+sse2," + FullFS; - else - FullFS = "+64bit,+sse2"; - } - - // If feature string is not empty, parse features string. - ParseSubtargetFeatures(CPUName, FullFS); - } else { - if (CPUName.empty()) { -#if defined (__x86_64__) || defined(__i386__) - CPUName = sys::getHostCPUName(); -#else - CPUName = "generic"; -#endif - } - // Otherwise, use CPUID to auto-detect feature set. - AutoDetectSubtargetFeatures(); - - // Make sure 64-bit features are available in 64-bit mode. - if (In64BitMode) { - if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); } - if (!HasCMov) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } - - if (X86SSELevel < SSE2) { - X86SSELevel = SSE2; - ToggleFeature(X86::FeatureSSE1); - ToggleFeature(X86::FeatureSSE2); - } - } + if (CPUName.empty()) + CPUName = "generic"; + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; } - // CPUName may have been set by the CPU detection code. Make sure the - // new MCSchedModel is used. + // If feature string is not empty, parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + + // Make sure the right MCSchedModel is used. InitCPUSchedModel(CPUName); if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM) @@ -547,33 +290,36 @@ void X86Subtarget::initializeEnvironment() { PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; + SlowLEA = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? 
MaxInlineSizeThreshold = 128; } X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, - unsigned StackAlignOverride) - : X86GenSubtargetInfo(TT, CPU, FS) - , X86ProcFamily(Others) - , PICStyle(PICStyles::None) - , TargetTriple(TT) - , StackAlignOverride(StackAlignOverride) - , In64BitMode(TargetTriple.getArch() == Triple::x86_64) - , In32BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() != Triple::CODE16) - , In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16) { + const std::string &FS, unsigned StackAlignOverride) + : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), + PICStyle(PICStyles::None), TargetTriple(TT), + StackAlignOverride(StackAlignOverride), + In64BitMode(TargetTriple.getArch() == Triple::x86_64), + In32BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() != Triple::CODE16), + In16BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() == Triple::CODE16) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); } -bool X86Subtarget::enablePostRAScheduler( - CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { +bool +X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode &Mode, + RegClassVector &CriticalPathRCs) const { Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } + +bool +X86Subtarget::enableEarlyIfConversion() const { + return hasCMov() && X86EarlyIfConv; +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 52986b9..703559a 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -178,6 +178,9 @@ protected: /// address generation (AG) time. bool LEAUsesAG; + /// SlowLEA - True if the LEA instruction with certain arguments is slow + bool SlowLEA; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -235,10 +238,6 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID - /// instruction. - void AutoDetectSubtargetFeatures(); - /// \brief Reset the features for the X86 target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: @@ -319,11 +318,13 @@ public: bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } + bool slowLEA() const { return SlowLEA; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } bool isAtom() const { return X86ProcFamily == IntelAtom; } + bool isSLM() const { return X86ProcFamily == IntelSLM; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -429,6 +430,8 @@ public: bool postRAScheduler() const { return PostRAScheduler; } + bool enableEarlyIfConversion() const override; + /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. 
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 6f09ccf..93760ef 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -108,6 +108,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = FloatABI::Hard; + // Windows stack unwinder gets confused when execution flow "falls through" + // after a call to 'noreturn' function. + // To prevent that, we emit a trap for 'unreachable' IR instructions. + // (which on X86, happens to be the 'ud2' instruction) + if (Subtarget.isTargetWin64()) + this->Options.TrapUnreachable = true; + initAsmInfo(); } @@ -119,12 +126,6 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); -// Temporary option to control early if-conversion for x86 while adding machine -// models. -static cl::opt<bool> -X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, - cl::desc("Enable early if-conversion on X86")); - //===----------------------------------------------------------------------===// // X86 Analysis Pass Setup //===----------------------------------------------------------------------===// @@ -177,19 +178,14 @@ bool X86PassConfig::addInstSelector() { if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); - // For 32-bit, prepend instructions to set the "global base reg" for PIC. - if (!getX86Subtarget().is64Bit()) - addPass(createGlobalBaseRegPass()); + addPass(createX86GlobalBaseRegPass()); return false; } bool X86PassConfig::addILPOpts() { - if (X86EarlyIfConv && getX86Subtarget().hasCMov()) { - addPass(&EarlyIfConverterID); - return true; - } - return false; + addPass(&EarlyIfConverterID); + return true; } bool X86PassConfig::addPreRegAlloc() { @@ -208,18 +204,13 @@ bool X86PassConfig::addPreEmitPass() { ShouldPrint = true; } - if (getX86Subtarget().hasAVX() && UseVZeroUpper) { + if (UseVZeroUpper) { addPass(createX86IssueVZeroUpperPass()); ShouldPrint = true; } - if (getOptLevel() != CodeGenOpt::None && - getX86Subtarget().padShortFunctions()) { + if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); - ShouldPrint = true; - } - if (getOptLevel() != CodeGenOpt::None && - getX86Subtarget().LEAusesAG()){ addPass(createX86FixupLEAs()); ShouldPrint = true; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 0a88e98..8157085 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -26,7 +26,7 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which // is an indirect pc-relative reference. - if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) { const MCSymbol *Sym = TM.getSymbol(GV, Mang); const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); @@ -62,7 +62,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( // operation. const SubOperator *Sub = dyn_cast<SubOperator>(CE); if (!Sub) - return 0; + return nullptr; // Symbols must first be numbers before we can subtract them, we need to see a // ptrtoint on both subtraction operands. 
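
One fix in the getTTypeGlobalReference hunk above deserves a note: Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel) is true when either flag is present, while the intent is to require both. A tiny demonstration of the difference; the enum values below match the conventional DW_EH_PE encodings, but the point holds for any two distinct bits:

    #include <cassert>

    enum { Indirect = 0x80, PCRel = 0x10 }; // conventional DW_EH_PE bit values

    int main() {
      unsigned encoding = PCRel; // only one of the two flags set

      // Old test: one combined mask, satisfied by either flag alone.
      bool either = encoding & (Indirect | PCRel);
      // Fixed test: each flag checked separately, so both are required.
      bool both = (encoding & Indirect) && (encoding & PCRel);

      assert(either && !both); // the old form fired spuriously here
      return 0;
    }
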
@@ -71,13 +71,13 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const PtrToIntOperator *SubRHS = dyn_cast<PtrToIntOperator>(Sub->getOperand(1)); if (!SubLHS || !SubRHS) - return 0; + return nullptr; // Our symbols should exist in address space zero, cowardly no-op if // otherwise. if (SubLHS->getPointerAddressSpace() != 0 || SubRHS->getPointerAddressSpace() != 0) - return 0; + return nullptr; // Both ptrtoint instructions must wrap global variables: // - Only global variables are eligible for image relative relocations. @@ -87,7 +87,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const GlobalVariable *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); if (!GVLHS || !GVRHS) - return 0; + return nullptr; // We expect __ImageBase to be a global variable without a section, externally // defined. @@ -96,11 +96,11 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" || !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() || GVRHS->hasSection()) - return 0; + return nullptr; // An image-relative, thread-local, symbol makes no sense. if (GVLHS->isThreadLocal()) - return 0; + return nullptr; return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index c04964d..91b9d40 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -14,37 +14,24 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86tti" #include "X86.h" #include "X86TargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "x86tti" + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. 
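
A mechanical change repeated across these files (X86SelectionDAGInfo.cpp, X86Subtarget.cpp, this file, X86VZeroUpper.cpp): #define DEBUG_TYPE moves from before the #includes to after them, so the macro can no longer leak into headers that define or use a DEBUG_TYPE of their own. The shape, reduced to a hedged sketch rather than a real LLVM pass:

    // Reduced sketch of the ordering rule, not a complete LLVM pass.
    #include <cstdio> // stand-in for the LLVM headers, included first

    // Defined only after all includes, so it scopes to this file's own code.
    #define DEBUG_TYPE "x86tti"

    int main() {
      std::printf("debug type: %s\n", DEBUG_TYPE);
      return 0;
    }
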
namespace llvm { void initializeX86TTIPass(PassRegistry &); } -static cl::opt<bool> -UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true), - cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden); -static cl::opt<unsigned> -PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0), - cl::desc("Threshold for X86 partial unrolling"), cl::Hidden); -static cl::opt<unsigned> -PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2), - cl::desc("Threshold for taken branches in X86 partial unrolling"), - cl::Hidden); - namespace { class X86TTI final : public ImmutablePass, public TargetTransformInfo { @@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - X86TTI() : ImmutablePass(ID), ST(0), TLI(0) { + X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -87,8 +74,6 @@ public: /// \name Scalar TTI Implementations /// @{ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - void getUnrollingPreferences(Loop *L, - UnrollingPreferences &UP) const override; /// @} @@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software; } -void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { - if (!UsePartialUnrolling) - return; - // According to the Intel 64 and IA-32 Architectures Optimization Reference - // Manual, Intel Core models and later have a loop stream detector - // (and associated uop queue) that can benefit from partial unrolling. - // The relevant requirements are: - // - The loop must have no more than 4 (8 for Nehalem and later) branches - // taken, and none of them may be calls. - // - The loop can have no more than 18 (28 for Nehalem and later) uops. - - // According to the Software Optimization Guide for AMD Family 15h Processors, - // models 30h-4fh (Steamroller and later) have a loop predictor and loop - // buffer which can benefit from partial unrolling. - // The relevant requirements are: - // - The loop must have fewer than 16 branches - // - The loop must have less than 40 uops in all executed loop branches - - unsigned MaxBranches, MaxOps; - if (PartialUnrollingThreshold.getNumOccurrences() > 0) { - MaxBranches = PartialUnrollingMaxBranches; - MaxOps = PartialUnrollingThreshold; - } else if (ST->isAtom()) { - // On the Atom, the throughput for taken branches is 2 cycles. For small - // simple loops, expand by a small factor to hide the backedge cost. - MaxBranches = 2; - MaxOps = 10; - } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) { - MaxBranches = 16; - MaxOps = 40; - } else if (ST->hasFMA4() /* Any other recent AMD */) { - return; - } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) { - MaxBranches = 8; - MaxOps = 28; - } else if (ST->hasSSSE3() /* Intel Core */) { - MaxBranches = 4; - MaxOps = 18; - } else { - return; - } - - // Scan the loop: don't unroll loops with calls, and count the potential - // number of taken branches (this is somewhat conservative because we're - // counting all block transitions as potential branches while in reality some - // of these will become implicit via block placement). 
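
The DFS scan that continues below ends by capping the unroll count, and that arithmetic is worth a worked example, since its replacement is the single LoopMicroOpBufferSize = 10 line in the new SLM model earlier in this commit. With the Nehalem-class limits (MaxBranches = 8) and a loop whose DFS depth is 3, the cap is (8 - 1) / (3 - 1) = 3 unrolled iterations. In sketch form, names mine:

    #include <cassert>

    // The removed code capped the unroll count so that the unrolled loop's
    // taken branches still fit the branch budget of the loop buffer:
    // UP.MaxCount = (MaxBranches - 1) / (MaxDepth - 1).
    static unsigned maxUnrollCount(unsigned maxBranches, unsigned maxDepth) {
      return maxDepth > 1 ? (maxBranches - 1) / (maxDepth - 1) : ~0u;
    }

    int main() {
      assert(maxUnrollCount(8, 3) == 3); // Nehalem limits, depth-3 loop
      assert(maxUnrollCount(8, 2) == 7); // straight-line body: backedge only
      assert(maxUnrollCount(4, 3) == 1); // older Core limits: no room to unroll
      return 0;
    }
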
@@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  static const CostTblEntry<MVT::SimpleValueType>
+  AVX2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX2()) {
+    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
@@ -350,10 +263,19 @@
     { ISD::SRA,  MVT::v16i8,   4 }, // psrlw, pand, pxor, psubb.
     { ISD::SRA,  MVT::v8i16,   1 }, // psraw.
     { ISD::SRA,  MVT::v4i32,   1 }, // psrad.
+
+    { ISD::SDIV, MVT::v8i16,   6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v8i16,   6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v4i32,  19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32,  15 }, // pmuludq sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
       ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
     int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
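The new tables price vector integer division by a uniform (splat) constant as the multiply-by-magic-constant sequences the backend emits (pmulhw/pmulhuw, pmuldq/pmuludq) rather than as a scalarized division. A query-side sketch of what this changes; the helper name is ours:

// Illustrative query (helper name is ours): the cost a vectorizer-style
// client now sees for "sdiv <8 x i16> %v, <splat constant>".
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static unsigned vectorSDivByConstCost(const TargetTransformInfo &TTI,
                                      LLVMContext &Ctx) {
  Type *VecTy = VectorType::get(Type::getInt16Ty(Ctx), 8); // <8 x i16>
  // On an SSE2 target this should now come back as 6 (the pmulhw sequence)
  // instead of a scalarization-based estimate.
  return TTI.getArithmeticInstrCost(
      Instruction::SDiv, VecTy, TargetTransformInfo::OK_AnyValue,
      TargetTransformInfo::OK_UniformConstantValue);
}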
@@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   if (BitSize == 0)
     return ~0U;
 
+  // Never hoist constants larger than 128bit, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // Fixme: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+  if (BitSize > 128)
+    return TCC_Free;
+
   if (Imm == 0)
     return TCC_Free;
 
@@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
@@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::ICmp:
     ImmIdx = 1;
     break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
   case Instruction::Trunc:
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   switch (IID) {
   default: return TCC_Free;
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d4341b9..0bb5f99 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-vzeroupper"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
@@ -28,6 +27,8 @@
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-vzeroupper"
+
 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
 
 namespace {
@@ -246,7 +247,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getTarget().getSubtarget<X86Subtarget>().hasAVX512())
+  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  if (!ST.hasAVX() || ST.hasAVX512())
     return false;
   TII = MF.getTarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
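On the X86VZeroUpper change: the pass previously skipped only AVX-512 subtargets and would still scan functions on targets without AVX at all; the rewritten guard also exits early in that case, since without AVX there is no 256-bit upper register state to zero. The motivation for the pass itself, sketched below in assembly comments (ours, not from this commit):

// Why vzeroupper is inserted before calls (illustrative): running legacy
// (non-VEX) SSE instructions while the upper halves of the ymm registers
// are dirty forces an expensive internal state save/restore on AVX-capable
// cores. Clearing the upper state first avoids that penalty:
//
//   vaddps  %ymm1, %ymm0, %ymm0   // 256-bit AVX work; upper halves dirty
//   vzeroupper                    // inserted by VZeroUpperInserter
//   callq   callee                // callee may run non-VEX SSE penalty-free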