diff options
| author | Stephen Hines <srhines@google.com> | 2013-01-21 13:15:17 -0800 |
|---|---|---|
| committer | Stephen Hines <srhines@google.com> | 2013-01-21 13:15:17 -0800 |
| commit | 059800f9e3fee2852672f846d91a2da14da7783a (patch) | |
| tree | a6ef16b7263252ae1b8069295ea9cbbae0d9467d /lib/Target/X86 | |
| parent | cbefa15de4821975bb99fc6d74b3bdb42b2df45c (diff) | |
| parent | b6714227eda5d499f7667fc865f931126a8dc488 (diff) | |
| download | external_llvm-059800f9e3fee2852672f846d91a2da14da7783a.zip external_llvm-059800f9e3fee2852672f846d91a2da14da7783a.tar.gz external_llvm-059800f9e3fee2852672f846d91a2da14da7783a.tar.bz2 | |
Merge remote-tracking branch 'upstream/master' into merge-llvm
Conflicts:
lib/CodeGen/AsmPrinter/AsmPrinter.cpp
lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
lib/MC/MCAssembler.cpp
lib/Support/Atomic.cpp
lib/Support/Memory.cpp
lib/Target/ARM/ARMJITInfo.cpp
Change-Id: Ib339baf88df5b04870c8df1bedcfe1f877ccab8d
Diffstat (limited to 'lib/Target/X86')
70 files changed, 7378 insertions, 5326 deletions
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt index 47489bb..54204d4 100644 --- a/lib/Target/X86/AsmParser/CMakeLists.txt +++ b/lib/Target/X86/AsmParser/CMakeLists.txt @@ -1,7 +1,6 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) add_llvm_library(LLVMX86AsmParser - X86AsmLexer.cpp X86AsmParser.cpp ) diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp deleted file mode 100644 index 2794e60..0000000 --- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp +++ /dev/null @@ -1,160 +0,0 @@ -//===-- X86AsmLexer.cpp - Tokenize X86 assembly to AsmTokens --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/X86BaseInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCTargetAsmLexer.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/SmallVector.h" - -using namespace llvm; - -namespace { - -class X86AsmLexer : public MCTargetAsmLexer { - const MCAsmInfo &AsmInfo; - - bool tentativeIsValid; - AsmToken tentativeToken; - - const AsmToken &lexTentative() { - tentativeToken = getLexer()->Lex(); - tentativeIsValid = true; - return tentativeToken; - } - - const AsmToken &lexDefinite() { - if (tentativeIsValid) { - tentativeIsValid = false; - return tentativeToken; - } - return getLexer()->Lex(); - } - - AsmToken LexTokenATT(); - AsmToken LexTokenIntel(); -protected: - AsmToken LexToken() { - if (!Lexer) { - SetError(SMLoc(), "No MCAsmLexer installed"); - return AsmToken(AsmToken::Error, "", 0); - } - - switch (AsmInfo.getAssemblerDialect()) { - default: - SetError(SMLoc(), "Unhandled dialect"); - return AsmToken(AsmToken::Error, "", 0); - case 0: - return LexTokenATT(); - case 1: - return LexTokenIntel(); - } - } -public: - X86AsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI) - : MCTargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) { - } -}; - -} // end anonymous namespace - -#define GET_REGISTER_MATCHER -#include "X86GenAsmMatcher.inc" - -AsmToken X86AsmLexer::LexTokenATT() { - AsmToken lexedToken = lexDefinite(); - - switch (lexedToken.getKind()) { - default: - return lexedToken; - case AsmToken::Error: - SetError(Lexer->getErrLoc(), Lexer->getErr()); - return lexedToken; - - case AsmToken::Percent: { - const AsmToken &nextToken = lexTentative(); - if (nextToken.getKind() != AsmToken::Identifier) - return lexedToken; - - - if (unsigned regID = MatchRegisterName(nextToken.getString())) { - lexDefinite(); - - // FIXME: This is completely wrong when there is a space or other - // punctuation between the % and the register name. - StringRef regStr(lexedToken.getString().data(), - lexedToken.getString().size() + - nextToken.getString().size()); - - return AsmToken(AsmToken::Register, regStr, - static_cast<int64_t>(regID)); - } - - // Match register name failed. If this is "db[0-7]", match it as an alias - // for dr[0-7]. - if (nextToken.getString().size() == 3 && - nextToken.getString().startswith("db")) { - int RegNo = -1; - switch (nextToken.getString()[2]) { - case '0': RegNo = X86::DR0; break; - case '1': RegNo = X86::DR1; break; - case '2': RegNo = X86::DR2; break; - case '3': RegNo = X86::DR3; break; - case '4': RegNo = X86::DR4; break; - case '5': RegNo = X86::DR5; break; - case '6': RegNo = X86::DR6; break; - case '7': RegNo = X86::DR7; break; - } - - if (RegNo != -1) { - lexDefinite(); - - // FIXME: This is completely wrong when there is a space or other - // punctuation between the % and the register name. - StringRef regStr(lexedToken.getString().data(), - lexedToken.getString().size() + - nextToken.getString().size()); - return AsmToken(AsmToken::Register, regStr, - static_cast<int64_t>(RegNo)); - } - } - - - return lexedToken; - } - } -} - -AsmToken X86AsmLexer::LexTokenIntel() { - const AsmToken &lexedToken = lexDefinite(); - - switch(lexedToken.getKind()) { - default: - return lexedToken; - case AsmToken::Error: - SetError(Lexer->getErrLoc(), Lexer->getErr()); - return lexedToken; - case AsmToken::Identifier: { - unsigned regID = MatchRegisterName(lexedToken.getString().lower()); - - if (regID) - return AsmToken(AsmToken::Register, - lexedToken.getString(), - static_cast<int64_t>(regID)); - return lexedToken; - } - } -} - -extern "C" void LLVMInitializeX86AsmLexer() { - RegisterMCAsmLexer<X86AsmLexer> X(TheX86_32Target); - RegisterMCAsmLexer<X86AsmLexer> Y(TheX86_64Target); -} diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index c89e738..ca438eb 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -8,19 +8,21 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/MC/MCStreamer.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -33,6 +35,7 @@ struct X86Operand; class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; MCAsmParser &Parser; + ParseInstructionInfo *InstInfo; private: MCAsmParser &getParser() const { return Parser; } @@ -40,8 +43,8 @@ private: bool Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(), - bool matchingInlineAsm = false) { - if (matchingInlineAsm) return true; + bool MatchingInlineAsm = false) { + if (MatchingInlineAsm) return true; return Parser.Error(L, Msg, Ranges); } @@ -53,32 +56,25 @@ private: X86Operand *ParseOperand(); X86Operand *ParseATTOperand(); X86Operand *ParseIntelOperand(); - X86Operand *ParseIntelMemOperand(); + X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc); + X86Operand *ParseIntelTypeOperator(SMLoc StartLoc); + X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc); X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp, + SmallString<64> &Err); + bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); bool processInstruction(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Ops); - bool MatchAndEmitInstruction(SMLoc IDLoc, + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out); - - bool MatchInstruction(SMLoc IDLoc, unsigned &Kind, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - SmallVectorImpl<MCInst> &MCInsts, - unsigned &OrigErrorInfo, - bool matchingInlineAsm = false); - - unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands, - unsigned OperandNum, unsigned &NumMCOperands) { - return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, - NumMCOperands); - } + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm); /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode. @@ -107,14 +103,15 @@ private: public: X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); - virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); virtual bool ParseDirective(AsmToken DirectiveID); @@ -170,6 +167,7 @@ struct X86Operand : public MCParsedAsmOperand { } Kind; SMLoc StartLoc, EndLoc; + SMLoc OffsetOfLoc; union { struct { @@ -183,6 +181,7 @@ struct X86Operand : public MCParsedAsmOperand { struct { const MCExpr *Val; + bool NeedAsmRewrite; } Imm; struct { @@ -192,6 +191,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; + bool NeedSizeDir; } Mem; }; @@ -202,8 +202,11 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } - + /// getLocRange - Get the range between the first and last token of this + /// operand. SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. + SMLoc getOffsetOfLoc() const { return OffsetOfLoc; } virtual void print(raw_ostream &OS) const {} @@ -227,6 +230,11 @@ struct X86Operand : public MCParsedAsmOperand { return Imm.Val; } + bool needAsmRewrite() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.NeedAsmRewrite; + } + const MCExpr *getMemDisp() const { assert(Kind == Memory && "Invalid access!"); return Mem.Disp; @@ -323,6 +331,20 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + unsigned getMemSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Size; + } + + bool isOffsetOf() const { + return OffsetOfLoc.getPointer(); + } + + bool needSizeDirective() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.NeedSizeDir; + } + bool isMem() const { return Kind == Memory; } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); @@ -441,28 +463,32 @@ struct X86Operand : public MCParsedAsmOperand { } static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { - SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size() - 1); + SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); X86Operand *Res = new X86Operand(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; } - static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) { + static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + SMLoc OffsetOfLoc = SMLoc()) { X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; + Res->OffsetOfLoc = OffsetOfLoc; return Res; } - static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ + static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc, + bool NeedRewrite = true){ X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; + Res->Imm.NeedAsmRewrite = NeedRewrite; return Res; } /// Create an absolute memory operand. - static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc, unsigned Size = 0) { + static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, bool NeedSizeDir = false){ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -470,6 +496,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } @@ -477,7 +504,7 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0) { + unsigned Size = 0, bool NeedSizeDir = false) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -492,6 +519,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; + Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } }; @@ -530,10 +558,12 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, Parser.Lex(); // Eat percent token. const AsmToken &Tok = Parser.getTok(); + EndLoc = Tok.getEndLoc(); + if (Tok.isNot(AsmToken::Identifier)) { if (isParsingIntelSyntax()) return true; return Error(StartLoc, "invalid register name", - SMRange(StartLoc, Tok.getEndLoc())); + SMRange(StartLoc, EndLoc)); } RegNo = MatchRegisterName(Tok.getString()); @@ -554,13 +584,12 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, X86II::isX86_64ExtendedReg(RegNo)) return Error(StartLoc, "register %" + Tok.getString() + " is only available in 64-bit mode", - SMRange(StartLoc, Tok.getEndLoc())); + SMRange(StartLoc, EndLoc)); } // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { RegNo = X86::ST0; - EndLoc = Tok.getLoc(); Parser.Lex(); // Eat 'st' // Check to see if we have '(4)' after %st. @@ -587,11 +616,13 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (getParser().Lex().isNot(AsmToken::RParen)) return Error(Parser.getTok().getLoc(), "expected ')'"); - EndLoc = Tok.getLoc(); + EndLoc = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat ')' return false; } + EndLoc = Parser.getTok().getEndLoc(); + // If this is "db[0-7]", match it as an alias // for dr[0-7]. if (RegNo == 0 && Tok.getString().size() == 3 && @@ -608,7 +639,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, } if (RegNo != 0) { - EndLoc = Tok.getLoc(); + EndLoc = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat it. return false; } @@ -617,10 +648,9 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) { if (isParsingIntelSyntax()) return true; return Error(StartLoc, "invalid register name", - SMRange(StartLoc, Tok.getEndLoc())); + SMRange(StartLoc, EndLoc)); } - EndLoc = Tok.getEndLoc(); Parser.Lex(); // Eat identifier token. return false; } @@ -633,23 +663,25 @@ X86Operand *X86AsmParser::ParseOperand() { /// getIntelMemOperandSize - Return intel memory operand size. static unsigned getIntelMemOperandSize(StringRef OpStr) { - unsigned Size = 0; - if (OpStr == "BYTE") Size = 8; - if (OpStr == "WORD") Size = 16; - if (OpStr == "DWORD") Size = 32; - if (OpStr == "QWORD") Size = 64; - if (OpStr == "XWORD") Size = 80; - if (OpStr == "XMMWORD") Size = 128; - if (OpStr == "YMMWORD") Size = 256; + unsigned Size = StringSwitch<unsigned>(OpStr) + .Cases("BYTE", "byte", 8) + .Cases("WORD", "word", 16) + .Cases("DWORD", "dword", 32) + .Cases("QWORD", "qword", 64) + .Cases("XWORD", "xword", 80) + .Cases("XMMWORD", "xmmword", 128) + .Cases("YMMWORD", "ymmword", 256) + .Default(0); return Size; } -X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, +X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, unsigned Size) { unsigned BaseReg = 0, IndexReg = 0, Scale = 1; - SMLoc Start = Parser.getTok().getLoc(), End; + const AsmToken &Tok = Parser.getTok(); + SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc(); - const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ] // Eat '[' @@ -663,16 +695,18 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, // Handle '[' 'symbol' ']' if (getParser().ParseExpression(Disp, End)) return 0; if (getLexer().isNot(AsmToken::RBrac)) - return ErrorOperand(Start, "Expected ']' token!"); + return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!"); + End = Parser.getTok().getEndLoc(); Parser.Lex(); return X86Operand::CreateMem(Disp, Start, End, Size); } } else if (getLexer().is(AsmToken::Integer)) { - int64_t Val = Parser.getTok().getIntVal(); + int64_t Val = Tok.getIntVal(); Parser.Lex(); - SMLoc Loc = Parser.getTok().getLoc(); + SMLoc Loc = Tok.getLoc(); if (getLexer().is(AsmToken::RBrac)) { // Handle '[' number ']' + End = Parser.getTok().getEndLoc(); Parser.Lex(); const MCExpr *Disp = MCConstantExpr::Create(Val, getContext()); if (SegReg) @@ -682,7 +716,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, } else if (getLexer().is(AsmToken::Star)) { // Handle '[' Scale*IndexReg ']' Parser.Lex(); - SMLoc IdxRegLoc = Parser.getTok().getLoc(); + SMLoc IdxRegLoc = Tok.getLoc(); if (ParseRegister(IndexReg, IdxRegLoc, End)) return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; @@ -690,16 +724,27 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, return ErrorOperand(Loc, "Unexpected token"); } - if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) { - bool isPlus = getLexer().is(AsmToken::Plus); + // Parse ][ as a plus. + bool ExpectRBrac = true; + if (getLexer().is(AsmToken::RBrac)) { + ExpectRBrac = false; + End = Parser.getTok().getEndLoc(); Parser.Lex(); - SMLoc PlusLoc = Parser.getTok().getLoc(); + } + + if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus) || + getLexer().is(AsmToken::LBrac)) { + ExpectRBrac = true; + bool isPlus = getLexer().is(AsmToken::Plus) || + getLexer().is(AsmToken::LBrac); + Parser.Lex(); + SMLoc PlusLoc = Tok.getLoc(); if (getLexer().is(AsmToken::Integer)) { - int64_t Val = Parser.getTok().getIntVal(); + int64_t Val = Tok.getIntVal(); Parser.Lex(); if (getLexer().is(AsmToken::Star)) { Parser.Lex(); - SMLoc IdxRegLoc = Parser.getTok().getLoc(); + SMLoc IdxRegLoc = Tok.getLoc(); if (ParseRegister(IndexReg, IdxRegLoc, End)) return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; @@ -710,21 +755,47 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, return ErrorOperand(PlusLoc, "unexpected token after +"); } else if (getLexer().is(AsmToken::Identifier)) { // This could be an index register or a displacement expression. - End = Parser.getTok().getLoc(); if (!IndexReg) ParseRegister(IndexReg, Start, End); - else if (getParser().ParseExpression(Disp, End)) return 0; + else if (getParser().ParseExpression(Disp, End)) + return 0; } } + + // Parse ][ as a plus. + if (getLexer().is(AsmToken::RBrac)) { + ExpectRBrac = false; + End = Parser.getTok().getEndLoc(); + Parser.Lex(); + if (getLexer().is(AsmToken::LBrac)) { + ExpectRBrac = true; + Parser.Lex(); + if (getParser().ParseExpression(Disp, End)) + return 0; + } + } else if (ExpectRBrac) { + if (getParser().ParseExpression(Disp, End)) + return 0; + } - if (getLexer().isNot(AsmToken::RBrac)) - if (getParser().ParseExpression(Disp, End)) return 0; + if (ExpectRBrac) { + if (getLexer().isNot(AsmToken::RBrac)) + return ErrorOperand(End, "expected ']' token!"); + End = Parser.getTok().getEndLoc(); + Parser.Lex(); + } - End = Parser.getTok().getLoc(); - if (getLexer().isNot(AsmToken::RBrac)) - return ErrorOperand(End, "expected ']' token!"); - Parser.Lex(); - End = Parser.getTok().getLoc(); + // Parse the dot operator (e.g., [ebx].foo.bar). + if (Tok.getString().startswith(".")) { + SmallString<64> Err; + const MCExpr *NewDisp; + if (ParseIntelDotOperator(Disp, &NewDisp, Err)) + return ErrorOperand(Tok.getLoc(), Err); + + End = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat the field. + Disp = NewDisp; + } // handle [-42] if (!BaseReg && !IndexReg) @@ -735,15 +806,15 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, } /// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand() { +X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { const AsmToken &Tok = Parser.getTok(); - SMLoc Start = Parser.getTok().getLoc(), End; - unsigned SegReg = 0; + SMLoc End; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); - assert (Tok.getString() == "PTR" && "Unexpected token!"); + assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") && + "Unexpected token!"); Parser.Lex(); } @@ -761,19 +832,166 @@ X86Operand *X86AsmParser::ParseIntelMemOperand() { } const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); - if (getParser().ParseExpression(Disp, End)) return 0; - return X86Operand::CreateMem(Disp, Start, End, Size); + if (getParser().ParseExpression(Disp, End)) + return 0; + + bool NeedSizeDir = false; + if (!Size && isParsingInlineAsm()) { + if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) { + const MCSymbol &Sym = SymRef->getSymbol(); + // FIXME: The SemaLookup will fail if the name is anything other then an + // identifier. + // FIXME: Pass a valid SMLoc. + SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size); + NeedSizeDir = Size > 0; + } + } + if (!isParsingInlineAsm()) + return X86Operand::CreateMem(Disp, Start, End, Size); + else + // When parsing inline assembly we set the base register to a non-zero value + // as we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0, + /*Scale*/1, Start, End, Size, NeedSizeDir); +} + +/// Parse the '.' operator. +bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, + const MCExpr **NewDisp, + SmallString<64> &Err) { + AsmToken Tok = *&Parser.getTok(); + uint64_t OrigDispVal, DotDispVal; + + // FIXME: Handle non-constant expressions. + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) { + OrigDispVal = OrigDisp->getValue(); + } else { + Err = "Non-constant offsets are not supported!"; + return true; + } + + // Drop the '.'. + StringRef DotDispStr = Tok.getString().drop_front(1); + + // .Imm gets lexed as a real. + if (Tok.is(AsmToken::Real)) { + APInt DotDisp; + DotDispStr.getAsInteger(10, DotDisp); + DotDispVal = DotDisp.getZExtValue(); + } else if (Tok.is(AsmToken::Identifier)) { + // We should only see an identifier when parsing the original inline asm. + // The front-end should rewrite this in terms of immediates. + assert (isParsingInlineAsm() && "Unexpected field name!"); + + unsigned DotDisp; + std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); + if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, + DotDisp)) { + Err = "Unable to lookup field reference!"; + return true; + } + DotDispVal = DotDisp; + } else { + Err = "Unexpected token type!"; + return true; + } + + if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); + unsigned Len = DotDispStr.size(); + unsigned Val = OrigDispVal + DotDispVal; + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, + Val)); + } + + *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); + return false; +} + +/// Parse the 'offset' operator. This operator is used to specify the +/// location rather then the content of a variable. +X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { + SMLoc OffsetOfLoc = Start; + Parser.Lex(); // Eat offset. + Start = Parser.getTok().getLoc(); + assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier"); + + SMLoc End; + const MCExpr *Val; + if (getParser().ParseExpression(Val, End)) + return ErrorOperand(Start, "Unable to parse expression!"); + + // Don't emit the offset operator. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + + // The offset operator will have an 'r' constraint, thus we need to create + // register operand to ensure proper matching. Just pick a GPR based on + // the size of a pointer. + unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc); +} + +/// Parse the 'TYPE' operator. The TYPE operator returns the size of a C or +/// C++ type or variable. If the variable is an array, TYPE returns the size of +/// a single element of the array. +X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) { + SMLoc TypeLoc = Start; + Parser.Lex(); // Eat offset. + Start = Parser.getTok().getLoc(); + assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier"); + + SMLoc End; + const MCExpr *Val; + if (getParser().ParseExpression(Val, End)) + return 0; + + unsigned Size = 0; + if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) { + const MCSymbol &Sym = SymRef->getSymbol(); + // FIXME: The SemaLookup will fail if the name is anything other then an + // identifier. + // FIXME: Pass a valid SMLoc. + if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size)) + return ErrorOperand(Start, "Unable to lookup TYPE of expr!"); + + Size /= 8; // Size is in terms of bits, but we want bytes in the context. + } + + // Rewrite the type operator and the C or C++ type or variable in terms of an + // immediate. E.g. TYPE foo -> $$4 + unsigned Len = End.getPointer() - TypeLoc.getPointer(); + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, Size)); + + const MCExpr *Imm = MCConstantExpr::Create(Size, getContext()); + return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false); } X86Operand *X86AsmParser::ParseIntelOperand() { SMLoc Start = Parser.getTok().getLoc(), End; + // offset operator. + StringRef AsmTokStr = Parser.getTok().getString(); + if ((AsmTokStr == "offset" || AsmTokStr == "OFFSET") && + isParsingInlineAsm()) + return ParseIntelOffsetOfOperator(Start); + + // Type directive. + if ((AsmTokStr == "type" || AsmTokStr == "TYPE") && + isParsingInlineAsm()) + return ParseIntelTypeOperator(Start); + + // Unsupported directives. + if (isParsingIntelSyntax() && + (AsmTokStr == "size" || AsmTokStr == "SIZE" || + AsmTokStr == "length" || AsmTokStr == "LENGTH")) + return ErrorOperand(Start, "Unsupported directive!"); + // immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) || getLexer().is(AsmToken::Minus)) { const MCExpr *Val; if (!getParser().ParseExpression(Val, End)) { - End = Parser.getTok().getLoc(); return X86Operand::CreateImm(Val, Start, End); } } @@ -781,12 +999,17 @@ X86Operand *X86AsmParser::ParseIntelOperand() { // register unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { - End = Parser.getTok().getLoc(); - return X86Operand::CreateReg(RegNo, Start, End); + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. + if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + getParser().Lex(); // Eat the colon. + return ParseIntelMemOperand(RegNo, Start); } // mem operand - return ParseIntelMemOperand(); + return ParseIntelMemOperand(0, Start); } X86Operand *X86AsmParser::ParseATTOperand() { @@ -956,7 +1179,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); return 0; } - SMLoc MemEnd = Parser.getTok().getLoc(); + SMLoc MemEnd = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat the ')'. // If we have both a base register and an index register make sure they are @@ -984,8 +1207,9 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { } bool X86AsmParser:: -ParseInstruction(StringRef Name, SMLoc NameLoc, +ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + InstInfo = &Info; StringRef PatchedName = Name; // FIXME: Hack to recognize setneb as setne. @@ -1520,27 +1744,12 @@ processInstruction(MCInst &Inst, } } +static const char *getSubtargetFeatureName(unsigned Val); bool X86AsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, +MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out) { - unsigned Kind; - unsigned ErrorInfo; - SmallVector<MCInst, 2> Insts; - - bool Error = MatchInstruction(IDLoc, Kind, Operands, Insts, - ErrorInfo); - if (!Error) - for (unsigned i = 0, e = Insts.size(); i != e; ++i) - Out.EmitInstruction(Insts[i]); - return Error; -} - -bool X86AsmParser:: -MatchInstruction(SMLoc IDLoc, unsigned &Kind, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo, - bool matchingInlineAsm) { + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); X86Operand *Op = static_cast<X86Operand*>(Operands[0]); assert(Op->isToken() && "Leading operand should always be a mnemonic!"); @@ -1557,7 +1766,8 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); - MCInsts.push_back(Inst); + if (!MatchingInlineAsm) + Out.EmitInstruction(Inst); const char *Repl = StringSwitch<const char*>(Op->getToken()) @@ -1579,23 +1789,38 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, MCInst Inst; // First, try a direct match. - switch (MatchInstructionImpl(Operands, Kind, Inst, OrigErrorInfo, + switch (MatchInstructionImpl(Operands, Inst, + ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())) { default: break; case Match_Success: // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. - while (processInstruction(Inst, Operands)) - ; + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; Inst.setLoc(IDLoc); - MCInsts.push_back(Inst); + if (!MatchingInlineAsm) + Out.EmitInstruction(Inst); + Opcode = Inst.getOpcode(); return false; - case Match_MissingFeature: - Error(IDLoc, "instruction requires a CPU feature not currently enabled", - EmptyRanges, matchingInlineAsm); - return true; + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing. + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); + } case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; @@ -1626,20 +1851,32 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, // Check for the various suffix matches. Tmp[Base.size()] = Suffixes[0]; unsigned ErrorInfoIgnore; + unsigned ErrorInfoMissingFeature; unsigned Match1, Match2, Match3, Match4; - unsigned tKind; - Match1 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); - if (Match1 == Match_Success) Kind = tKind; + Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match1 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[1]; - Match2 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); - if (Match2 == Match_Success) Kind = tKind; + Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match2 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[2]; - Match3 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); - if (Match3 == Match_Success) Kind = tKind; + Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match3 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[3]; - Match4 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); - if (Match4 == Match_Success) Kind = tKind; + Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match4 == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; // Restore the old token. Op->setTokenValue(Base); @@ -1652,7 +1889,9 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, (Match3 == Match_Success) + (Match4 == Match_Success); if (NumSuccessfulMatches == 1) { Inst.setLoc(IDLoc); - MCInsts.push_back(Inst); + if (!MatchingInlineAsm) + Out.EmitInstruction(Inst); + Opcode = Inst.getOpcode(); return false; } @@ -1679,7 +1918,7 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, OS << "'" << Base << MatchChars[i] << "'"; } OS << ")"; - Error(IDLoc, OS.str(), EmptyRanges, matchingInlineAsm); + Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); return true; } @@ -1690,37 +1929,44 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { - ArrayRef<SMRange> Ranges = matchingInlineAsm ? EmptyRanges : + ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges : Op->getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", - Ranges, matchingInlineAsm); + Ranges, MatchingInlineAsm); } // Recover location info for the operand if we know which was the problem. - if (OrigErrorInfo != ~0U) { - if (OrigErrorInfo >= Operands.size()) + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction", - EmptyRanges, matchingInlineAsm); + EmptyRanges, MatchingInlineAsm); - X86Operand *Operand = (X86Operand*)Operands[OrigErrorInfo]; + X86Operand *Operand = (X86Operand*)Operands[ErrorInfo]; if (Operand->getStartLoc().isValid()) { SMRange OperandRange = Operand->getLocRange(); return Error(Operand->getStartLoc(), "invalid operand for instruction", - OperandRange, matchingInlineAsm); + OperandRange, MatchingInlineAsm); } } return Error(IDLoc, "invalid operand for instruction", EmptyRanges, - matchingInlineAsm); + MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a // missing feature. if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) + (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){ - Error(IDLoc, "instruction requires a CPU feature not currently enabled", - EmptyRanges, matchingInlineAsm); - return true; + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) { + if (ErrorInfoMissingFeature & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); } // If one instruction matched with an invalid operand, report this as an @@ -1728,13 +1974,13 @@ MatchInstruction(SMLoc IDLoc, unsigned &Kind, if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) + (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){ Error(IDLoc, "invalid operand for instruction", EmptyRanges, - matchingInlineAsm); + MatchingInlineAsm); return true; } // If all of these were an outright failure, report it in a useless way. Error(IDLoc, "unknown use of instruction mnemonic without a size suffix", - EmptyRanges, matchingInlineAsm); + EmptyRanges, MatchingInlineAsm); return true; } @@ -1809,16 +2055,13 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { return false; } - -extern "C" void LLVMInitializeX86AsmLexer(); - // Force static initialization. extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); - LLVMInitializeX86AsmLexer(); } #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION +#define GET_SUBTARGET_FEATURE_NAME #include "X86GenAsmMatcher.inc" diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index b886d46..d14899d 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -10,14 +10,12 @@ tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel) tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM X86GenEDInfo.inc -gen-enhanced-disassembly-info) add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp X86COFFMachineModuleInfo.cpp X86CodeEmitter.cpp - X86ELFWriterInfo.cpp X86FastISel.cpp X86FloatingPoint.cpp X86FrameLowering.cpp @@ -27,11 +25,13 @@ set(sources X86JITInfo.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp + X86PadShortFunction.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp X86Subtarget.cpp X86TargetMachine.cpp X86TargetObjectFile.cpp + X86TargetTransformInfo.cpp X86VZeroUpper.cpp ) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index f136927..ca6f80c 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -16,11 +16,9 @@ #include "X86Disassembler.h" #include "X86DisassemblerDecoder.h" - -#include "llvm/MC/EDInstInfo.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -33,7 +31,6 @@ #include "X86GenRegisterInfo.inc" #define GET_INSTRINFO_ENUM #include "X86GenInstrInfo.inc" -#include "X86GenEDInfo.inc" using namespace llvm; using namespace llvm::X86Disassembler; @@ -84,10 +81,6 @@ X86GenericDisassembler::~X86GenericDisassembler() { delete MII; } -const EDInstInfo *X86GenericDisassembler::getEDInfo() const { - return instInfoX86; -} - /// regionReader - a callback function that wraps the readByte method from /// MemoryObject. /// diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h index 0dbfa26..b92427a 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.h +++ b/lib/Target/X86/Disassembler/X86Disassembler.h @@ -78,7 +78,7 @@ uint16_t operands; #define INSTRUCTION_IDS \ - unsigned instructionIDs; + uint16_t instructionIDs; #include "X86DisassemblerDecoderCommon.h" @@ -95,8 +95,6 @@ class MCSubtargetInfo; class MemoryObject; class raw_ostream; -struct EDInstInfo; - namespace X86Disassembler { /// X86GenericDisassembler - Generic disassembler for all X86 platforms. @@ -122,8 +120,6 @@ public: raw_ostream &vStream, raw_ostream &cStream) const; - /// getEDInfo - See MCDisassembler. - const EDInstInfo *getEDInfo() const; private: DisassemblerMode fMode; }; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index af444d1..85d8a99 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -138,6 +138,10 @@ static InstrUID decode(OpcodeType type, if (modFromModRM(modRM) == 0x3) return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; case MODRM_FULL: return modRMTable[dec->instructionIDs+modRM]; } @@ -690,7 +694,7 @@ static int getIDWithAttrMask(uint16_t* instructionID, * @param orig - The instruction that is not 16-bit * @param equiv - The instruction that is 16-bit */ -static BOOL is16BitEquvalent(const char* orig, const char* equiv) { +static BOOL is16BitEquivalent(const char* orig, const char* equiv) { off_t i; for (i = 0;; i++) { @@ -856,7 +860,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { specWithOpSizeName = x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg); - if (is16BitEquvalent(specName, specWithOpSizeName)) { + if (is16BitEquivalent(specName, specWithOpSizeName)) { insn->instructionID = instructionIDWithOpsize; insn->spec = specifierForUID(instructionIDWithOpsize); } else { diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 05cbb4c..407ead3 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -24,7 +24,7 @@ extern "C" { uint16_t operands; #define INSTRUCTION_IDS \ - unsigned instructionIDs; + uint16_t instructionIDs; #include "X86DisassemblerDecoderCommon.h" diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index b0a0e1e..23dfe4b 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -160,6 +160,10 @@ typedef uint16_t InstrUID; * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode * corresponds to one instruction; otherwise, it corresponds to * a different instruction. + * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte + * divided by 8 is used to select instruction; otherwise, each + * value of the ModR/M byte could correspond to a different + * instruction. * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This corresponds to instructions that use reg field as opcode * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond @@ -169,6 +173,7 @@ typedef uint16_t InstrUID; #define MODRMTYPES \ ENUM_ENTRY(MODRM_ONEENTRY) \ ENUM_ENTRY(MODRM_SPLITRM) \ + ENUM_ENTRY(MODRM_SPLITMISC) \ ENUM_ENTRY(MODRM_SPLITREG) \ ENUM_ENTRY(MODRM_FULL) diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 5118e4c..e357710 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -14,11 +14,12 @@ #define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" -#include "X86InstComments.h" +#include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" -#include "llvm/MC/MCInst.h" +#include "X86InstComments.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" @@ -33,11 +34,19 @@ using namespace llvm; void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << '%' << getRegisterName(RegNo); + OS << markup("<reg:") + << '%' << getRegisterName(RegNo) + << markup(">"); } void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::LOCK) + OS << "\tlock\n"; + // Try to print any aliases first. if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); @@ -52,7 +61,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { - switch (MI->getOperand(Op).getImm()) { + int64_t Imm = MI->getOperand(Op).getImm() & 0xf; + switch (Imm) { default: llvm_unreachable("Invalid ssecc argument!"); case 0: O << "eq"; break; case 1: O << "lt"; break; @@ -70,6 +80,30 @@ void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, case 0xd: O << "ge"; break; case 0xe: O << "gt"; break; case 0xf: O << "true"; break; + } +} + +void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; case 0x10: O << "eq_os"; break; case 0x11: O << "lt_oq"; break; case 0x12: O << "le_oq"; break; @@ -89,15 +123,15 @@ void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, } } -/// print_pcrel_imm - This is used to print an immediate value that ends up +/// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value (e.g. for jumps and calls). These /// print slightly differently than normal immediates. For example, a $ is not /// emitted. -void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) - O << Op.getImm(); + O << formatImm(Op.getImm()); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print @@ -119,17 +153,21 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - O << '%' << getRegisterName(Op.getReg()); + printRegName(O, Op.getReg()); } else if (Op.isImm()) { // Print X86 immediates as signed values. - O << '$' << (int64_t)Op.getImm(); + O << markup("<imm:") + << '$' << formatImm((int64_t)Op.getImm()) + << markup(">"); if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << '$' << *Op.getExpr(); + O << markup("<imm:") + << '$' << *Op.getExpr() + << markup(">"); } } @@ -140,6 +178,8 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &DispSpec = MI->getOperand(Op+3); const MCOperand &SegReg = MI->getOperand(Op+4); + O << markup("<mem:"); + // If this has a segment register, print it. if (SegReg.getReg()) { printOperand(MI, Op+4, O); @@ -149,7 +189,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, if (DispSpec.isImm()) { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) - O << DispVal; + O << formatImm(DispVal); } else { assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); O << *DispSpec.getExpr(); @@ -164,9 +204,15 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << ','; printOperand(MI, Op+2, O); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); - if (ScaleVal != 1) - O << ',' << ScaleVal; + if (ScaleVal != 1) { + O << ',' + << markup("<imm:") + << ScaleVal // never printed in hex. + << markup(">"); + } } O << ')'; } + + O << markup(">"); } diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 2e00bff..8e09183 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -40,7 +40,8 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 4ea662c..141f4a4 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -1,4 +1,4 @@ -//===-- X86IntelInstPrinter.cpp - AT&T assembly instruction printing ------===// +//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// // // The LLVM Compiler Infrastructure // @@ -7,17 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This file includes code for rendering MCInst instances as AT&T-style +// This file includes code for rendering MCInst instances as Intel-style // assembly. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" -#include "X86InstComments.h" +#include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" -#include "llvm/MC/MCInst.h" +#include "X86InstComments.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" @@ -32,6 +33,12 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::LOCK) + OS << "\tlock\n"; + printInstruction(MI, OS); // Next always print the annotation. @@ -44,7 +51,8 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { - switch (MI->getOperand(Op).getImm()) { + int64_t Imm = MI->getOperand(Op).getImm() & 0xf; + switch (Imm) { default: llvm_unreachable("Invalid ssecc argument!"); case 0: O << "eq"; break; case 1: O << "lt"; break; @@ -62,6 +70,30 @@ void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, case 0xd: O << "ge"; break; case 0xe: O << "gt"; break; case 0xf: O << "true"; break; + } +} + +void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; case 0x10: O << "eq_os"; break; case 0x11: O << "lt_oq"; break; case 0x12: O << "le_oq"; break; @@ -78,14 +110,13 @@ void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, case 0x1d: O << "ge_oq"; break; case 0x1e: O << "gt_oq"; break; case 0x1f: O << "true_us"; break; - } } -/// print_pcrel_imm - This is used to print an immediate value that ends up +/// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value. -void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) O << Op.getImm(); @@ -153,8 +184,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, printOperand(MI, Op+2, O); NeedPlus = true; } - - + if (!DispSpec.isImm()) { if (NeedPlus) O << " + "; assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 4f5938d..bb769eb 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This class prints an X86 MCInst to intel style .s file syntax. +// This class prints an X86 MCInst to Intel style .s file syntax. // //===----------------------------------------------------------------------===// @@ -37,7 +37,8 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); - void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "OPAQUE PTR "; diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 32e40fe..acc90ec 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -66,9 +66,10 @@ public: }; class X86AsmBackend : public MCAsmBackend { + StringRef CPU; public: - X86AsmBackend(const Target &T) - : MCAsmBackend() {} + X86AsmBackend(const Target &T, StringRef _CPU) + : MCAsmBackend(), CPU(_CPU) {} unsigned getNumFixupKinds() const { return X86::NumTargetFixupKinds; @@ -112,7 +113,7 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCInstFragment *DF, + const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const; void relaxInstruction(const MCInst &Inst, MCInst &Res) const; @@ -254,7 +255,7 @@ bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCInstFragment *DF, + const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const { // Relax if the value is too big for a (signed) i8. return int64_t(Value) != int64_t(int8_t(Value)); @@ -278,9 +279,9 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { Res.setOpcode(RelaxedOp); } -/// writeNopData - Write optimal nops to the output file for the \arg Count -/// bytes. This returns the number of bytes written. It may return 0 if -/// the \arg Count is more than the maximum optimal nops. +/// \brief Write a sequence of optimal nops to the output, covering \p Count +/// bytes. +/// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { static const uint8_t Nops[10][10] = { // nop @@ -305,6 +306,15 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, }; + // This CPU doesnt support long nops. If needed add more. + // FIXME: Can we get this from the subtarget somehow? + if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" || + CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") { + for (uint64_t i = 0; i < Count; ++i) + OW->Write8(0x90); + return true; + } + // Write an optimal sequence for the first 15 bytes. const uint64_t OptimalCount = (Count < 16) ? Count : 15; const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10; @@ -327,8 +337,8 @@ namespace { class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; - ELFX86AsmBackend(const Target &T, uint8_t _OSABI) - : X86AsmBackend(T), OSABI(_OSABI) { + ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(_OSABI) { HasReliableSymbolDifference = true; } @@ -340,21 +350,21 @@ public: class ELFX86_32AsmBackend : public ELFX86AsmBackend { public: - ELFX86_32AsmBackend(const Target &T, uint8_t OSABI) - : ELFX86AsmBackend(T, OSABI) {} + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI); + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); } }; class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: - ELFX86_64AsmBackend(const Target &T, uint8_t OSABI) - : ELFX86AsmBackend(T, OSABI) {} + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI); + return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); } }; @@ -362,8 +372,8 @@ class WindowsX86AsmBackend : public X86AsmBackend { bool Is64Bit; public: - WindowsX86AsmBackend(const Target &T, bool is64Bit) - : X86AsmBackend(T) + WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU) + : X86AsmBackend(T, CPU) , Is64Bit(is64Bit) { } @@ -374,14 +384,14 @@ public: class DarwinX86AsmBackend : public X86AsmBackend { public: - DarwinX86AsmBackend(const Target &T) - : X86AsmBackend(T) { } + DarwinX86AsmBackend(const Target &T, StringRef CPU) + : X86AsmBackend(T, CPU) { } }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { public: - DarwinX86_32AsmBackend(const Target &T) - : DarwinX86AsmBackend(T) {} + DarwinX86_32AsmBackend(const Target &T, StringRef CPU) + : DarwinX86AsmBackend(T, CPU) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, @@ -392,8 +402,8 @@ public: class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { public: - DarwinX86_64AsmBackend(const Target &T) - : DarwinX86AsmBackend(T) { + DarwinX86_64AsmBackend(const Target &T, StringRef CPU) + : DarwinX86AsmBackend(T, CPU) { HasReliableSymbolDifference = true; } @@ -439,28 +449,28 @@ public: } // end anonymous namespace -MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) { +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) { Triple TheTriple(TT); if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T); + return new DarwinX86_32AsmBackend(T, CPU); - if (TheTriple.isOSWindows()) - return new WindowsX86AsmBackend(T, false); + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) + return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - return new ELFX86_32AsmBackend(T, OSABI); + return new ELFX86_32AsmBackend(T, OSABI, CPU); } -MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) { +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { Triple TheTriple(TT); if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T); + return new DarwinX86_64AsmBackend(T, CPU); - if (TheTriple.isOSWindows()) - return new WindowsX86AsmBackend(T, true); + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) + return new WindowsX86AsmBackend(T, true, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - return new ELFX86_64AsmBackend(T, OSABI); + return new ELFX86_64AsmBackend(T, OSABI, CPU); } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index db597fb..7ea1961 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -276,9 +276,9 @@ namespace X86II { MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40, MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46, - MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, - MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, - MRM_DF = 55, + MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50, + MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54, + MRM_DE = 55, MRM_DF = 56, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by @@ -580,11 +580,11 @@ namespace X86II { case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: - case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: + case X86II::MRM_D4: case X86II::MRM_D5: + case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: + case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: return -1; } } diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 5a42a80..de80dd8 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -20,7 +20,7 @@ using namespace llvm; namespace { class X86ELFObjectWriter : public MCELFObjectTargetWriter { public: - X86ELFObjectWriter(bool is64Bit, uint8_t OSABI); + X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); virtual ~X86ELFObjectWriter(); protected: @@ -30,10 +30,11 @@ namespace { }; } -X86ELFObjectWriter::X86ELFObjectWriter(bool Is64Bit, uint8_t OSABI) - : MCELFObjectTargetWriter(Is64Bit, OSABI, - Is64Bit ? ELF::EM_X86_64 : ELF::EM_386, - /*HasRelocationAddend*/ Is64Bit) {} +X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, + uint16_t EMachine) + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 uses Rel instead of RelA. + /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} X86ELFObjectWriter::~X86ELFObjectWriter() {} @@ -48,7 +49,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); unsigned Type; - if (is64Bit()) { + if (getEMachine() == ELF::EM_X86_64) { if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); @@ -130,7 +131,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, case FK_Data_1: Type = ELF::R_X86_64_8; break; } } - } else { + } else if (getEMachine() == ELF::EM_386) { if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); @@ -210,15 +211,17 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, case FK_Data_1: Type = ELF::R_386_8; break; } } - } + } else + llvm_unreachable("Unsupported ELF machine type."); return Type; } MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, - bool Is64Bit, - uint8_t OSABI) { + bool IsELF64, + uint8_t OSABI, + uint16_t EMachine) { MCELFObjectTargetWriter *MOTW = - new X86ELFObjectWriter(Is64Bit, OSABI); + new X86ELFObjectWriter(IsELF64, OSABI, EMachine); return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index b0acd7d..16488eb 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -34,6 +34,10 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), clEnumValN(Intel, "intel", "Emit Intel-style assembly"), clEnumValEnd)); +static cl::opt<bool> +MarkedJTDataRegions("mark-data-regions", cl::init(false), + cl::desc("Mark code section jump table data regions."), + cl::Hidden); void X86MCAsmInfoDarwin::anchor() { } @@ -59,6 +63,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { SupportsDebugInformation = true; DwarfUsesInlineInfoSection = true; + UseDataRegionDirectives = MarkedJTDataRegions; // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 4a38324..122204a 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -28,8 +29,8 @@ using namespace llvm; namespace { class X86MCCodeEmitter : public MCCodeEmitter { - X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT + X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCSubtargetInfo &STI; MCContext &Ctx; @@ -51,8 +52,8 @@ public: return (STI.getFeatureBits() & X86::Mode64Bit) == 0; } - static unsigned GetX86RegNum(const MCOperand &MO) { - return X86_MC::getX86RegNum(MO.getReg()); + unsigned GetX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7; } // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range @@ -64,8 +65,8 @@ public: // VEX.VVVV => XMM9 => ~9 // // See table 4-35 of Intel AVX Programming Reference for details. - static unsigned char getVEXRegisterEncoding(const MCInst &MI, - unsigned OpNum) { + unsigned char getVEXRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { unsigned SrcReg = MI.getOperand(OpNum).getReg(); unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum)); if (X86II::isX86_64ExtendedReg(SrcReg)) @@ -560,15 +561,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } - // Set the vector length to 256-bit if YMM0-YMM15 is used - for (unsigned i = 0; i != MI.getNumOperands(); ++i) { - if (!MI.getOperand(i).isReg()) - continue; - unsigned SrcReg = MI.getOperand(i).getReg(); - if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) - VEX_L = 1; - } - // Classify VEX_B, VEX_4V, VEX_R, VEX_X unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; @@ -1129,13 +1121,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: - case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E8: - case X86II::MRM_F0: case X86II::MRM_F8: - case X86II::MRM_F9: + case X86II::MRM_D4: case X86II::MRM_D5: + case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: + case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E8: case X86II::MRM_F0: + case X86II::MRM_F8: case X86II::MRM_F9: EmitByte(BaseOpcode, CurByte, OS); unsigned char MRM; @@ -1150,6 +1142,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_D0: MRM = 0xD0; break; case X86II::MRM_D1: MRM = 0xD1; break; case X86II::MRM_D4: MRM = 0xD4; break; + case X86II::MRM_D5: MRM = 0xD5; break; case X86II::MRM_D8: MRM = 0xD8; break; case X86II::MRM_D9: MRM = 0xD9; break; case X86II::MRM_DA: MRM = 0xDA; break; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 3482363..5e84530 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -12,19 +12,19 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" -#include "X86MCAsmInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "InstPrinter/X86IntelInstPrinter.h" -#include "llvm/MC/MachineLocation.h" +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Triple.h" -#include "llvm/Support/Host.h" +#include "llvm/MC/MachineLocation.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #define GET_REGINFO_MC_DESC @@ -209,117 +209,10 @@ unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { return DWARFFlavour::X86_32_Generic; } -/// getX86RegNum - This function maps LLVM register identifiers to their X86 -/// specific numbering, which is used in various places encoding instructions. -unsigned X86_MC::getX86RegNum(unsigned RegNo) { - switch(RegNo) { - case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX; - case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX; - case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX; - case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX; - case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH: - return N86::ESP; - case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH: - return N86::EBP; - case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH: - return N86::ESI; - case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH: - return N86::EDI; - - case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: - return N86::EAX; - case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: - return N86::ECX; - case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: - return N86::EDX; - case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: - return N86::EBX; - case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: - return N86::ESP; - case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: - return N86::EBP; - case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: - return N86::ESI; - case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: - return N86::EDI; - - case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3: - case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7: - return RegNo-X86::ST0; - - case X86::XMM0: case X86::XMM8: - case X86::YMM0: case X86::YMM8: case X86::MM0: - return 0; - case X86::XMM1: case X86::XMM9: - case X86::YMM1: case X86::YMM9: case X86::MM1: - return 1; - case X86::XMM2: case X86::XMM10: - case X86::YMM2: case X86::YMM10: case X86::MM2: - return 2; - case X86::XMM3: case X86::XMM11: - case X86::YMM3: case X86::YMM11: case X86::MM3: - return 3; - case X86::XMM4: case X86::XMM12: - case X86::YMM4: case X86::YMM12: case X86::MM4: - return 4; - case X86::XMM5: case X86::XMM13: - case X86::YMM5: case X86::YMM13: case X86::MM5: - return 5; - case X86::XMM6: case X86::XMM14: - case X86::YMM6: case X86::YMM14: case X86::MM6: - return 6; - case X86::XMM7: case X86::XMM15: - case X86::YMM7: case X86::YMM15: case X86::MM7: - return 7; - - case X86::ES: return 0; - case X86::CS: return 1; - case X86::SS: return 2; - case X86::DS: return 3; - case X86::FS: return 4; - case X86::GS: return 5; - - case X86::CR0: case X86::CR8 : case X86::DR0: return 0; - case X86::CR1: case X86::CR9 : case X86::DR1: return 1; - case X86::CR2: case X86::CR10: case X86::DR2: return 2; - case X86::CR3: case X86::CR11: case X86::DR3: return 3; - case X86::CR4: case X86::CR12: case X86::DR4: return 4; - case X86::CR5: case X86::CR13: case X86::DR5: return 5; - case X86::CR6: case X86::CR14: case X86::DR6: return 6; - case X86::CR7: case X86::CR15: case X86::DR7: return 7; - - // Pseudo index registers are equivalent to a "none" - // scaled index (See Intel Manual 2A, table 2-3) - case X86::EIZ: - case X86::RIZ: - return 4; - - default: - assert((int(RegNo) > 0) && "Unknown physical register!"); - return 0; - } -} - void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these. for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) { - int SEH = X86_MC::getX86RegNum(Reg); - switch (Reg) { - case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: - case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: - case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: - case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: - case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: - case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: - case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: - case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: - case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: - case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: - case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: - case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: - SEH += 8; - break; - } + unsigned SEH = MRI->getEncodingValue(Reg); MRI->mapLLVMRegToSEHReg(Reg, SEH); } } @@ -364,7 +257,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false), - X86_MC::getDwarfRegFlavour(TT, true)); + X86_MC::getDwarfRegFlavour(TT, true), + RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; } @@ -379,11 +273,15 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { MAI = new X86_64MCAsmInfoDarwin(TheTriple); else MAI = new X86MCAsmInfoDarwin(TheTriple); + } else if (TheTriple.getEnvironment() == Triple::ELF) { + // Force the use of an ELF container. + MAI = new X86ELFMCAsmInfo(TheTriple); } else if (TheTriple.getOS() == Triple::Win32) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) { MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { + // The default is ELF. MAI = new X86ELFMCAsmInfo(TheTriple); } @@ -465,7 +363,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - if (TheTriple.isOSWindows()) + if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 4650069..981aa1a 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -64,8 +64,6 @@ namespace X86_MC { unsigned getDwarfRegFlavour(StringRef TT, bool isEH); - unsigned getX86RegNum(unsigned RegNo); - void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance. @@ -80,8 +78,8 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCSubtargetInfo &STI, MCContext &Ctx); -MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT); -MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT); +MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU); +MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU); /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, @@ -91,8 +89,9 @@ MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, /// createX86ELFObjectWriter - Construct an X86 ELF object writer. MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, - bool Is64Bit, - uint8_t OSABI); + bool IsELF64, + uint8_t OSABI, + uint16_t EMachine); /// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index f0f1982..64f005c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -7,23 +7,25 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" -#include "llvm/MC/MCAssembler.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" using namespace llvm; using namespace llvm::object; namespace { class X86MachObjectWriter : public MCMachObjectTargetWriter { - void RecordScatteredRelocation(MachObjectWriter *Writer, + bool RecordScatteredRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, @@ -335,7 +337,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Writer->addRelocation(Fragment->getParent(), MRE); } -void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, +bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, @@ -381,6 +383,19 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, // Relocations are written out in reverse order, so the PAIR comes first. if (Type == macho::RIT_Difference || Type == macho::RIT_Generic_LocalDifference) { + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. + if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().FatalError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + Buffer + + ") into 24 bits of scattered " + "relocation entry."); + llvm_unreachable("fatal error returned?!"); + } + macho::RelocationEntry MRE; MRE.Word0 = ((0 << 0) | (macho::RIT_Pair << 24) | @@ -389,6 +404,16 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, macho::RF_Scattered); MRE.Word1 = Value2; Writer->addRelocation(Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) + return false; } macho::RelocationEntry MRE; @@ -399,6 +424,7 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, macho::RF_Scattered); MRE.Word1 = Value; Writer->addRelocation(Fragment->getParent(), MRE); + return true; } void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, @@ -469,9 +495,11 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // If this is a difference or a defined symbol plus an offset, then we need a // scattered relocation entry. Differences always require scattered // relocations. - if (Target.getSymB()) - return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + if (Target.getSymB()) { + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + return; + } // Get the symbol data, if any. MCSymbolData *SD = 0; @@ -483,9 +511,13 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, uint32_t Offset = Target.getConstant(); if (IsPCRel) Offset += 1 << Log2Size; - if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD)) - return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + // Try to record the scattered relocation if needed. Fall back to non + // scattered if necessary (see comments in RecordScatteredRelocation() + // for details). + if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) && + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue)) + return; // See <reloc.h>. uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile index 949661e..e518fec 100644 --- a/lib/Target/X86/Makefile +++ b/lib/Target/X86/Makefile @@ -16,8 +16,7 @@ BUILT_SOURCES = X86GenRegisterInfo.inc X86GenInstrInfo.inc \ X86GenAsmWriter.inc X86GenAsmMatcher.inc \ X86GenAsmWriter1.inc X86GenDAGISel.inc \ X86GenDisassemblerTables.inc X86GenFastISel.inc \ - X86GenCallingConv.inc X86GenSubtargetInfo.inc \ - X86GenEDInfo.inc + X86GenCallingConv.inc X86GenSubtargetInfo.inc DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 6a8a4fd..b4285a0 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1568,43 +1568,6 @@ The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" //===---------------------------------------------------------------------===// - -Testcase: -int a(int x) { return (x & 127) > 31; } - -Current output: - movl 4(%esp), %eax - andl $127, %eax - cmpl $31, %eax - seta %al - movzbl %al, %eax - ret - -Ideal output: - xorl %eax, %eax - testl $96, 4(%esp) - setne %al - ret - -This should definitely be done in instcombine, canonicalizing the range -condition into a != condition. We get this IR: - -define i32 @a(i32 %x) nounwind readnone { -entry: - %0 = and i32 %x, 127 ; <i32> [#uses=1] - %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1] - %2 = zext i1 %1 to i32 ; <i32> [#uses=1] - ret i32 %2 -} - -Instcombine prefers to strength reduce relational comparisons to equality -comparisons when possible, this should be another case of that. This could -be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it -looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already -be redesigned to use ComputeMaskedBits and friends. - - -//===---------------------------------------------------------------------===// Testcase: int x(int a) { return (a&0xf0)>>4; } diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 52a67f7..815d235 100644 --- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Module.h" +#include "llvm/IR/Module.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index dce5b4d..1f9919f 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -63,11 +63,12 @@ FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM, /// FunctionPass *createEmitX86CodeToMemory(); -/// createX86MaxStackAlignmentHeuristicPass - This function returns a pass -/// which determines whether the frame pointer register should be -/// reserved in case dynamic stack alignment is later required. -/// -FunctionPass *createX86MaxStackAlignmentHeuristicPass(); +/// \brief Creates an X86-specific Target Transformation Info pass. +ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); + +/// createX86PadShortFunctions - Return a pass that pads short functions +/// with NOOPs. This will prevent a stall when returning on the Atom. +FunctionPass *createX86PadShortFunctions(); } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index d078a7b..3ab2899 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -118,11 +118,16 @@ def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", "Support BMI instructions">; def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; +def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", + "Support RTM instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", "HasSlowDivide", "true", "Use small divide for positive values less than 256">; +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", + "PadShortFunctions", "true", + "Pad short functions">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -153,7 +158,8 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, @@ -162,9 +168,9 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B, +def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, - FeatureSlowDivide]>; + FeatureSlowDivide, FeaturePadShortFunctions]>; // "Arrandale" along with corei3 and corei5 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureFastUAMem, @@ -180,19 +186,20 @@ def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, // Sandy Bridge // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. -def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL]>; +def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; // Ivy Bridge -def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, +def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>; // Haswell -def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase, +def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, + FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA]>; + FeatureBMI2, FeatureFMA, + FeatureRTM]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -231,6 +238,7 @@ def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePCLMUL, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, FeatureFMA]>; +def : Proc<"geode", [Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; def : Proc<"winchip2", [Feature3DNow]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index a4785c9..5b3e0ba 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -13,34 +13,33 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "X86MCInstLower.h" +#include "InstPrinter/X86ATTInstPrinter.h" #include "X86.h" #include "X86COFFMachineModuleInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "llvm/CallingConv.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Type.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/Mangler.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -206,10 +205,10 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, } } -/// print_pcrel_imm - This is used to print an immediate value that ends up +/// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value. These print slightly differently, for /// example, a $ is not emitted. -void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, +void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { @@ -243,7 +242,7 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - EVT VT = (strcmp(Modifier+6,"64") == 0) ? + MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); Reg = getX86SubSuperRegister(Reg, VT); @@ -267,46 +266,6 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, } } -void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op, - raw_ostream &O) { - unsigned char value = MI->getOperand(Op).getImm(); - switch (value) { - default: llvm_unreachable("Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - case 0x10: O << "eq_os"; break; - case 0x11: O << "lt_oq"; break; - case 0x12: O << "le_oq"; break; - case 0x13: O << "unord_s"; break; - case 0x14: O << "neq_us"; break; - case 0x15: O << "nlt_uq"; break; - case 0x16: O << "nle_uq"; break; - case 0x17: O << "ord_s"; break; - case 0x18: O << "eq_us"; break; - case 0x19: O << "nge_uq"; break; - case 0x1a: O << "ngt_uq"; break; - case 0x1b: O << "false_os"; break; - case 0x1c: O << "neq_os"; break; - case 0x1d: O << "ge_oq"; break; - case 0x1e: O << "gt_oq"; break; - case 0x1f: O << "true_us"; break; - } -} - void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O, const char *Modifier) { const MachineOperand &BaseReg = MI->getOperand(Op); @@ -365,10 +324,51 @@ void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, printLeaMemReference(MI, Op, O, Modifier); } -void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op, - raw_ostream &O) { - O << *MF->getPICBaseSymbol() << '\n'; - O << *MF->getPICBaseSymbol() << ':'; +void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier, + unsigned AsmVariant){ + const MachineOperand &BaseReg = MI->getOperand(Op); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + const MachineOperand &IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + const MachineOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+4, O, Modifier, AsmVariant); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+2, O, Modifier, AsmVariant); + NeedPlus = true; + } + + assert (DispSpec.isImm() && "Displacement is not an immediate!"); + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + O << ']'; } bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, @@ -459,7 +459,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return false; case 'P': // This is the operand of a call, treat specially. - print_pcrel_imm(MI, OpNo, O); + printPCRelImm(MI, OpNo, O); return false; case 'n': // Negate the immediate or print a '-' before the operand. @@ -481,6 +481,11 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { + if (AsmVariant) { + printIntelMemReference(MI, OpNo, O); + return false; + } + if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. @@ -682,7 +687,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const TargetData *TD = TM.getTargetData(); + const DataLayout *TD = TM.getDataLayout(); for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 0062387..61eb14e 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -34,47 +34,48 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { Subtarget = &TM.getSubtarget<X86Subtarget>(); } - virtual const char *getPassName() const { + virtual const char *getPassName() const LLVM_OVERRIDE { return "X86 AT&T-Style Assembly Printer"; } const X86Subtarget &getSubtarget() const { return *Subtarget; } - virtual void EmitStartOfAsmFile(Module &M); + virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE; - virtual void EmitEndOfAsmFile(Module &M); + virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE; - virtual void EmitInstruction(const MachineInstr *MI); + virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE; void printSymbolOperand(const MachineOperand &MO, raw_ostream &O); // These methods are used by the tablegen'erated instruction printer. void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, const char *Modifier = 0, unsigned AsmVariant = 0); - void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); - - void printMachineInstruction(const MachineInstr *MI); - void printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O); + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) LLVM_OVERRIDE; + virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) LLVM_OVERRIDE; + void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O, const char *Modifier=NULL); void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O, const char *Modifier=NULL); - void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O); + void printIntelMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier=NULL, + unsigned AsmVariant = 1); - bool runOnMachineFunction(MachineFunction &F); + virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + virtual MachineLocation + getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE; }; } // end namespace llvm diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h index 471eb31..0dfeb42 100644 --- a/lib/Target/X86/X86COFFMachineModuleInfo.h +++ b/lib/Target/X86/X86COFFMachineModuleInfo.h @@ -15,12 +15,12 @@ #define X86COFF_MACHINEMODULEINFO_H #include "X86MachineFunctionInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/MachineModuleInfo.h" namespace llvm { class X86MachineFunctionInfo; - class TargetData; + class DataLayout; /// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation /// for X86 COFF targets. diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a6d2709..7ad2fdd 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -88,6 +88,30 @@ def RetCC_X86_32_Fast : CallingConv<[ CCDelegateTo<RetCC_X86Common> ]>; +// Intel_OCL_BI return-value convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_HiPE : CallingConv<[ + // Promote all types to i32 + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> +]>; + // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. @@ -108,17 +132,30 @@ def RetCC_X86_Win64_C : CallingConv<[ CCDelegateTo<RetCC_X86_64_C> ]>; +// X86-64 HiPE return-value convention. +def RetCC_X86_64_HiPE : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> +]>; // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // If HiPE, use RetCC_X86_32_HiPE. + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + // Otherwise, use RetCC_X86_32_C. CCDelegateTo<RetCC_X86_32_C> ]>; // This is the root return-value convention for the X86-64 backend. def RetCC_X86_64 : CallingConv<[ + // HiPE uses RetCC_X86_64_HiPE + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -128,6 +165,10 @@ def RetCC_X86_64 : CallingConv<[ // This is the return-value convention used for the entire X86 backend. def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, CCDelegateTo<RetCC_X86_32> ]>; @@ -249,6 +290,18 @@ def CC_X86_64_GHC : CallingConv<[ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> ]>; +def CC_X86_64_HiPE : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + //===----------------------------------------------------------------------===// // X86 C Calling Convention //===----------------------------------------------------------------------===// @@ -324,7 +377,7 @@ def CC_X86_32_FastCall : CallingConv<[ CCIfNest<CCAssignToReg<[EAX]>>, // The first 2 integer arguments are passed in ECX/EDX - CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, // Otherwise, same as everything else. CCDelegateTo<CC_X86_32_Common> @@ -380,6 +433,42 @@ def CC_X86_32_GHC : CallingConv<[ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> ]>; +def CC_X86_32_HiPE : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, + CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, + CCDelegateTo<CC_X86_32_C> +]>; + //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// @@ -390,6 +479,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -398,6 +488,7 @@ def CC_X86_32 : CallingConv<[ // This is the root argument convention for the X86-64 backend. def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -408,6 +499,7 @@ def CC_X86_64 : CallingConv<[ // This is the argument convention used for the entire X86 backend. def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, CCDelegateTo<CC_X86_32> ]>; @@ -426,3 +518,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; + + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index e202321..bc77334 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -13,23 +13,23 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "x86-emitter" +#include "X86.h" #include "X86InstrInfo.h" #include "X86JITInfo.h" +#include "X86Relocations.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "X86Relocations.h" -#include "X86.h" -#include "llvm/LLVMContext.h" -#include "llvm/PassManager.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/JITCodeEmitter.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/PassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -42,7 +42,7 @@ namespace { template<class CodeEmitter> class Emitter : public MachineFunctionPass { const X86InstrInfo *II; - const TargetData *TD; + const DataLayout *TD; X86TargetMachine &TM; CodeEmitter &MCE; MachineModuleInfo *MMI; @@ -56,7 +56,7 @@ namespace { MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} Emitter(X86TargetMachine &tm, CodeEmitter &mce, - const X86InstrInfo &ii, const TargetData &td, bool is64) + const X86InstrInfo &ii, const DataLayout &td, bool is64) : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(is64), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -109,6 +109,14 @@ namespace { void emitMemModRMByte(const MachineInstr &MI, unsigned Op, unsigned RegOpcodeField, intptr_t PCAdj = 0); + + unsigned getX86RegNum(unsigned RegNo) const { + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + return TRI->getEncodingValue(RegNo) & 0x7; + } + + unsigned char getVEXRegisterEncoding(const MachineInstr &MI, + unsigned OpNum) const; }; template<class CodeEmitter> @@ -128,7 +136,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { MCE.setModuleInfo(MMI); II = TM.getInstrInfo(); - TD = TM.getTargetData(); + TD = TM.getDataLayout(); Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; @@ -363,7 +371,7 @@ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, template<class CodeEmitter> void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeFld){ - MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg))); + MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg))); } template<class CodeEmitter> @@ -501,7 +509,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, // 2-7) and absolute references. unsigned BaseRegNo = -1U; if (BaseReg != 0 && BaseReg != X86::RIP) - BaseRegNo = X86_MC::getX86RegNum(BaseReg); + BaseRegNo = getX86RegNum(BaseReg); if (// The SIB byte must be used if there is an index register. IndexReg.getReg() == 0 && @@ -577,15 +585,15 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, // Manual 2A, table 2-7. The displacement has already been output. unsigned IndexRegNo; if (IndexReg.getReg()) - IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg()); + IndexRegNo = getX86RegNum(IndexReg.getReg()); else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) IndexRegNo = 4; emitSIBByte(SS, IndexRegNo, 5); } else { - unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg); + unsigned BaseRegNo = getX86RegNum(BaseReg); unsigned IndexRegNo; if (IndexReg.getReg()) - IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg()); + IndexRegNo = getX86RegNum(IndexReg.getReg()); else IndexRegNo = 4; // For example [ESP+1*<noreg>+4] emitSIBByte(SS, IndexRegNo, BaseRegNo); @@ -756,10 +764,12 @@ void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags, // VEX.VVVV => XMM9 => ~9 // // See table 4-35 of Intel AVX Programming Reference for details. -static unsigned char getVEXRegisterEncoding(const MachineInstr &MI, - unsigned OpNum) { +template<class CodeEmitter> +unsigned char +Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI, + unsigned OpNum) const { unsigned SrcReg = MI.getOperand(OpNum).getReg(); - unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg()); + unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg()); if (X86II::isX86_64ExtendedReg(SrcReg)) SrcRegNum |= 8; @@ -921,17 +931,6 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, } - // Set the vector length to 256-bit if YMM0-YMM15 is used - for (unsigned i = 0; i != MI.getNumOperands(); ++i) { - if (!MI.getOperand(i).isReg()) - continue; - if (MI.getOperand(i).isImplicit()) - continue; - unsigned SrcReg = MI.getOperand(i).getReg(); - if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) - VEX_L = 1; - } - // Classify VEX_B, VEX_4V, VEX_R, VEX_X unsigned NumOps = Desc->getNumOperands(); unsigned CurOp = 0; @@ -1246,7 +1245,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::AddRegFrm: { MCE.emitByte(BaseOpcode + - X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg())); + getX86RegNum(MI.getOperand(CurOp++).getReg())); if (CurOp == NumOps) break; @@ -1281,7 +1280,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::MRMDestReg: { MCE.emitByte(BaseOpcode); emitRegModRMByte(MI.getOperand(CurOp).getReg(), - X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg())); + getX86RegNum(MI.getOperand(CurOp+1).getReg())); CurOp += 2; break; } @@ -1292,7 +1291,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) SrcRegNum++; emitMemModRMByte(MI, CurOp, - X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg())); + getX86RegNum(MI.getOperand(SrcRegNum).getReg())); CurOp = SrcRegNum + 1; break; } @@ -1308,7 +1307,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, ++SrcRegNum; emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), - X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); + getX86RegNum(MI.getOperand(CurOp).getReg())); // 2 operands skipped with HasMemOp4, compensate accordingly CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; if (HasVEX_4VOp3) @@ -1330,7 +1329,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? X86II::getSizeOfImm(Desc->TSFlags) : 0; emitMemModRMByte(MI, FirstMemOp, - X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); + getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); CurOp += AddrOperands + 1; if (HasVEX_4VOp3) ++CurOp; @@ -1420,7 +1419,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, MCE.emitByte(BaseOpcode); // Duplicate register, used by things like MOV8r0 (aka xor reg,reg). emitRegModRMByte(MI.getOperand(CurOp).getReg(), - X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); + getX86RegNum(MI.getOperand(CurOp).getReg())); ++CurOp; break; @@ -1453,7 +1452,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand : CurOp); ++CurOp; - unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4; + unsigned RegNum = getX86RegNum(MO.getReg()) << 4; if (X86II::isX86_64ExtendedReg(MO.getReg())) RegNum |= 1 << 7; // If there is an additional 5th operand it must be an immediate, which diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp deleted file mode 100644 index c1a49a7..0000000 --- a/lib/Target/X86/X86ELFWriterInfo.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF writer information for the X86 backend. -// -//===----------------------------------------------------------------------===// - -#include "X86ELFWriterInfo.h" -#include "X86Relocations.h" -#include "llvm/Function.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Implementation of the X86ELFWriterInfo class -//===----------------------------------------------------------------------===// - -X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_) - : TargetELFWriterInfo(is64Bit_, isLittleEndian_) { - EMachine = is64Bit ? EM_X86_64 : EM_386; - } - -X86ELFWriterInfo::~X86ELFWriterInfo() {} - -unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { - if (is64Bit) { - switch(MachineRelTy) { - case X86::reloc_pcrel_word: - return ELF::R_X86_64_PC32; - case X86::reloc_absolute_word: - return ELF::R_X86_64_32; - case X86::reloc_absolute_word_sext: - return ELF::R_X86_64_32S; - case X86::reloc_absolute_dword: - return ELF::R_X86_64_64; - case X86::reloc_picrel_word: - default: - llvm_unreachable("unknown x86_64 machine relocation type"); - } - } else { - switch(MachineRelTy) { - case X86::reloc_pcrel_word: - return ELF::R_386_PC32; - case X86::reloc_absolute_word: - return ELF::R_386_32; - case X86::reloc_absolute_word_sext: - case X86::reloc_absolute_dword: - case X86::reloc_picrel_word: - default: - llvm_unreachable("unknown x86 machine relocation type"); - } - } -} - -long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, - long int Modifier) const { - if (is64Bit) { - switch(RelTy) { - case ELF::R_X86_64_PC32: return Modifier - 4; - case ELF::R_X86_64_32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_64: - return Modifier; - default: - llvm_unreachable("unknown x86_64 relocation type"); - } - } else { - switch(RelTy) { - case ELF::R_386_PC32: return Modifier - 4; - case ELF::R_386_32: return Modifier; - default: - llvm_unreachable("unknown x86 relocation type"); - } - } -} - -unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { - if (is64Bit) { - switch(RelTy) { - case ELF::R_X86_64_PC32: - case ELF::R_X86_64_32: - case ELF::R_X86_64_32S: - return 32; - case ELF::R_X86_64_64: - return 64; - default: - llvm_unreachable("unknown x86_64 relocation type"); - } - } else { - switch(RelTy) { - case ELF::R_386_PC32: - case ELF::R_386_32: - return 32; - default: - llvm_unreachable("unknown x86 relocation type"); - } - } -} - -bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { - if (is64Bit) { - switch(RelTy) { - case ELF::R_X86_64_PC32: - return true; - case ELF::R_X86_64_32: - case ELF::R_X86_64_32S: - case ELF::R_X86_64_64: - return false; - default: - llvm_unreachable("unknown x86_64 relocation type"); - } - } else { - switch(RelTy) { - case ELF::R_386_PC32: - return true; - case ELF::R_386_32: - return false; - default: - llvm_unreachable("unknown x86 relocation type"); - } - } -} - -unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const { - return is64Bit ? - X86::reloc_absolute_dword : X86::reloc_absolute_word; -} - -long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset, - unsigned RelOffset, - unsigned RelTy) const { - - if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32) - return SymOffset - (RelOffset + 4); - - llvm_unreachable("computeRelocation unknown for this relocation type"); -} diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h deleted file mode 100644 index a45b5bb..0000000 --- a/lib/Target/X86/X86ELFWriterInfo.h +++ /dev/null @@ -1,59 +0,0 @@ -//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF writer information for the X86 backend. -// -//===----------------------------------------------------------------------===// - -#ifndef X86_ELF_WRITER_INFO_H -#define X86_ELF_WRITER_INFO_H - -#include "llvm/Target/TargetELFWriterInfo.h" - -namespace llvm { - - class X86ELFWriterInfo : public TargetELFWriterInfo { - - public: - X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_); - virtual ~X86ELFWriterInfo(); - - /// getRelocationType - Returns the target specific ELF Relocation type. - /// 'MachineRelTy' contains the object code independent relocation type - virtual unsigned getRelocationType(unsigned MachineRelTy) const; - - /// hasRelocationAddend - True if the target uses an addend in the - /// ELF relocation entry. - virtual bool hasRelocationAddend() const { return is64Bit ? true : false; } - - /// getDefaultAddendForRelTy - Gets the default addend value for a - /// relocation entry based on the target ELF relocation type. - virtual long int getDefaultAddendForRelTy(unsigned RelTy, - long int Modifier = 0) const; - - /// getRelTySize - Returns the size of relocatable field in bits - virtual unsigned getRelocationTySize(unsigned RelTy) const; - - /// isPCRelativeRel - True if the relocation type is pc relative - virtual bool isPCRelativeRel(unsigned RelTy) const; - - /// getJumpTableRelocationTy - Returns the machine relocation type used - /// to reference a jumptable. - virtual unsigned getAbsoluteLabelMachineRelTy() const; - - /// computeRelocation - Some relocatable fields could be relocated - /// directly, avoiding the relocation symbol emission, compute the - /// final relocation value for this symbol. - virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset, - unsigned RelTy) const; - }; - -} // end llvm namespace - -#endif // X86_ELF_WRITER_INFO_H diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 54704d8..5facb7b 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -14,24 +14,24 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86ISelLowering.h" +#include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/CallingConv.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalVariable.h" -#include "llvm/GlobalAlias.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" @@ -45,9 +45,9 @@ class X86FastISel : public FastISel { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// StackPtr - Register used as the stack pointer. + /// RegInfo - X86 register info. /// - unsigned StackPtr; + const X86RegisterInfo *RegInfo; /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. @@ -61,9 +61,9 @@ public: const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { Subtarget = &TM.getSubtarget<X86Subtarget>(); - StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); + RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); } virtual bool TargetSelectInstruction(const Instruction *I); @@ -297,7 +297,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, case MVT::i32: Opc = X86::MOV32mi; break; case MVT::i64: // Must be a 32-bit sign extended value. - if ((int)CI->getSExtValue() == CI->getSExtValue()) + if (isInt<32>(CI->getSExtValue())) Opc = X86::MOV64mi32; break; } @@ -710,6 +710,8 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { bool X86FastISel::X86SelectRet(const Instruction *I) { const ReturnInst *Ret = cast<ReturnInst>(I); const Function &F = *I->getParent()->getParent(); + const X86MachineFunctionInfo *X86MFInfo = + FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); if (!FuncInfo.CanLowerReturn) return false; @@ -724,8 +726,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { return false; // Don't handle popping bytes on return for now. - if (FuncInfo.MF->getInfo<X86MachineFunctionInfo>() - ->getBytesToPopOnReturn() != 0) + if (X86MFInfo->getBytesToPopOnReturn() != 0) return 0; // fastcc with -tailcallopt is intended to provide a guaranteed @@ -739,8 +740,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), - Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -809,6 +809,19 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { MRI.addLiveOut(VA.getLocReg()); } + // The x86-64 ABI for returning structs by value requires that we copy + // the sret argument into %rax for the return. We saved the argument into + // a virtual register in the entry block, so now we copy the value out + // and into %rax. + if (Subtarget->is64Bit() && F.hasStructRetAttr()) { + unsigned Reg = X86MFInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()!"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + X86::RAX).addReg(Reg); + MRI.addLiveOut(X86::RAX); + } + // Now emit the RET. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET)); return true; @@ -1515,6 +1528,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) return X86VisitIntrinsicCall(*II); + // Allow SelectionDAG isel to handle tail calls. + if (cast<CallInst>(I)->isTailCall()) + return false; + return DoSelectCall(I, 0); } @@ -1567,8 +1584,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), - Outs, TLI); + GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), *FuncInfo.MF, FTy->isVarArg(), Outs, FTy->getContext()); @@ -1771,7 +1787,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { } else { unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; - AM.Base.Reg = StackPtr; + AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; @@ -1891,11 +1907,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { ComputeValueVTs(TLI, I->getType(), RetTys); for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { EVT VT = RetTys[i]; - EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); for (unsigned j = 0; j != NumRegs; ++j) { ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT.getSimpleVT(); + MyFlags.VT = RegisterVT; MyFlags.Used = !CS.getInstruction()->use_empty(); if (CS.paramHasAttr(0, Attribute::SExt)) MyFlags.Flags.setSExt(); @@ -2140,13 +2156,13 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { MVT VT; if (!isTypeLegal(CF->getType(), VT)) - return false; + return 0; // Get opcode and regclass for the given zero. unsigned Opc = 0; const TargetRegisterClass *RC = NULL; switch (VT.SimpleTy) { - default: return false; + default: return 0; case MVT::f32: if (X86ScalarSSEf32) { Opc = X86::FsFLD0SS; @@ -2167,7 +2183,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { break; case MVT::f80: // No f80 support yet. - return false; + return 0; } unsigned ResultReg = createResultReg(RC); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 9d5de81..0585b43 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -26,17 +26,17 @@ #define DEBUG_TYPE "x86-codegen" #include "X86.h" #include "X86InstrInfo.h" -#include "llvm/InlineAsm.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -111,7 +111,7 @@ namespace { EdgeBundles *Bundles; // Return a bitmask of FP registers in block's live-in list. - unsigned calcLiveInMask(MachineBasicBlock *MBB) { + static unsigned calcLiveInMask(MachineBasicBlock *MBB) { unsigned Mask = 0; for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) { @@ -171,7 +171,7 @@ namespace { // Shuffle live registers to match the expectations of successor blocks. void finishBlockStack(); -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dumpStack() const { dbgs() << "Stack contents:"; for (unsigned i = 0; i != StackTop; ++i) { @@ -198,7 +198,7 @@ namespace { } /// getScratchReg - Return an FP register that is not currently in use. - unsigned getScratchReg() { + unsigned getScratchReg() const { for (int i = NumFPRegs - 1; i >= 8; --i) if (!isLive(i)) return i; @@ -206,7 +206,7 @@ namespace { } /// isScratchReg - Returns trus if RegNo is a scratch FP register. - bool isScratchReg(unsigned RegNo) { + static bool isScratchReg(unsigned RegNo) { return RegNo > 8 && RegNo < NumFPRegs; } @@ -311,7 +311,7 @@ namespace { void handleSpecialFP(MachineBasicBlock::iterator &I); // Check if a COPY instruction is using FP registers. - bool isFPCopy(MachineInstr *MI) { + static bool isFPCopy(MachineInstr *MI) { unsigned DstReg = MI->getOperand(0).getReg(); unsigned SrcReg = MI->getOperand(1).getReg(); @@ -577,8 +577,8 @@ namespace { friend bool operator<(const TableEntry &TE, unsigned V) { return TE.from < V; } - friend bool LLVM_ATTRIBUTE_USED operator<(unsigned V, - const TableEntry &TE) { + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V, + const TableEntry &TE) { return V < TE.from; } }; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 2238688..420aeb8 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -17,18 +17,18 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/Function.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -313,11 +313,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, if (CSI.empty()) return; std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - const TargetData *TD = TM.getTargetData(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(); + int stackGrowth = -RegInfo->getSlotSize(); // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. @@ -625,6 +625,22 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { return CompactUnwindEncoding; } +/// usesTheStack - This function checks if any of the users of EFLAGS +/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has +/// to use the stack, and if we don't adjust the stack we clobber the first +/// frame index. +/// See X86InstrInfo::copyPhysReg. +static bool usesTheStack(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS), + re = MRI.reg_end(); ri != re; ++ri) + if (ri->isCopy()) + return true; + + return false; +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -673,12 +689,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the - // stack pointer (we fit in the Red Zone). - if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) && + // stack pointer (we fit in the Red Zone). We also check that we don't + // push and pop from the stack. + if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; @@ -715,9 +734,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // ELSE => DW_CFA_offset_extended std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - const TargetData *TD = MF.getTarget().getTargetData(); uint64_t NumBytes = 0; - int stackGrowth = -TD->getPointerSize(); + int stackGrowth = -SlotSize; if (HasFP) { // Calculate required stack adjustment. @@ -836,8 +854,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MI->getOperand(3).setIsDead(); } - DL = MBB.findDebugLoc(MBBI); - // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. @@ -1141,7 +1157,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } MachineInstr *NewMI = prior(MBBI); - NewMI->copyImplicitOps(MBBI); + NewMI->copyImplicitOps(MF, MBBI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 5fdc61e..935f9bd 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -19,24 +19,21 @@ #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/Type.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Support/CFG.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); @@ -100,7 +97,7 @@ namespace { Base_Reg = Reg; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; @@ -191,7 +188,6 @@ namespace { SDNode *Select(SDNode *N); SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); - SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); @@ -246,13 +242,15 @@ namespace { else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp, AM.SymbolFlags); - else if (AM.ES) + else if (AM.ES) { + assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); - else if (AM.JT != -1) + } else if (AM.JT != -1) { + assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); - else if (AM.BlockAddr) - Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32, - true, AM.SymbolFlags); + } else if (AM.BlockAddr) + Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, + AM.SymbolFlags); else Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32); @@ -361,7 +359,7 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { /// MoveBelowCallOrigChain - Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, - SDValue Call, SDValue OrigChain) { + SDValue Call, SDValue OrigChain) { SmallVector<SDValue, 8> Ops; SDValue Chain = OrigChain.getOperand(0); if (Chain.getNode() == Load.getNode()) @@ -385,11 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); + + unsigned NumOps = Call.getNode()->getNumOperands(); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); - for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i) + for (unsigned i = 1, e = NumOps; i != e; ++i) Ops.push_back(Call.getOperand(i)); - CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps); } /// isCalleeLoad - Return true if call address is a load and it can be @@ -398,6 +398,10 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, /// In the case of a tail call, there isn't a callseq node between the call /// chain and the load. static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { + // The transformation is somewhat dangerous if the call's chain was glued to + // the call. After MoveBelowOrigChain the load is moved between the call and + // the chain, this can create a cycle if the load is not folded. So it is + // *really* important that we are sure the load will be folded. if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) return false; LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); @@ -416,6 +420,11 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { if (!Chain.getNumOperands()) return false; + // Since we are not checking for AA here, conservatively abort if the chain + // writes to memory. It's not safe to move the callee (a load) across a store. + if (isa<MemSDNode>(Chain.getNode()) && + cast<MemSDNode>(Chain.getNode())->writeMem()) + return false; if (Chain.getOperand(0).getNode() == Callee.getNode()) return true; if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && @@ -427,7 +436,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize); + OptForSize = MF->getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { @@ -435,7 +445,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (OptLevel != CodeGenOpt::None && (N->getOpcode() == X86ISD::CALL || - N->getOpcode() == X86ISD::TC_RETURN)) { + (N->getOpcode() == X86ISD::TC_RETURN && + // Only does this if load can be foled into TC_RETURN. + (Subtarget->is64Bit() || + getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. /// @@ -654,10 +667,16 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); - } else { - AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress(); - AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags(); - } + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + AM = Backup; + return true; + } + } else + llvm_unreachable("Unhandled symbol reference node."); if (N.getOpcode() == X86ISD::WrapperRIP) AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); @@ -686,10 +705,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); - } else { - AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress(); - AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags(); - } + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.Disp += BA->getOffset(); + AM.SymbolFlags = BA->getTargetFlags(); + } else + llvm_unreachable("Unhandled symbol reference node."); return false; } @@ -1021,8 +1042,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.IndexReg = ShVal; return false; } - break; } + break; case ISD::SRL: { // Scale must not be used already. @@ -1283,7 +1304,9 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, // that are not a MemSDNode, and thus don't have proper addrspace info. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores - Parent->getOpcode() != X86ISD::TLSCALL) { // Fixme + Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp + Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS. @@ -1470,6 +1493,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { SDValue In1 = Node->getOperand(1); SDValue In2L = Node->getOperand(2); SDValue In2H = Node->getOperand(3); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) return NULL; @@ -1483,159 +1507,13 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { return ResNode; } -// FIXME: Figure out some way to unify this with the 'or' and other code -// below. -SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { - if (Node->hasAnyUseOfValue(0)) - return 0; - - // Optimize common patterns for __sync_add_and_fetch and - // __sync_sub_and_fetch where the result is not used. This allows us - // to use "lock" version of add, sub, inc, dec instructions. - // FIXME: Do not use special instructions but instead add the "lock" - // prefix to the target node somehow. The extra information will then be - // transferred to machine instruction and it denotes the prefix. - SDValue Chain = Node->getOperand(0); - SDValue Ptr = Node->getOperand(1); - SDValue Val = Node->getOperand(2); - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return 0; - - bool isInc = false, isDec = false, isSub = false, isCN = false; - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); - if (CN && CN->getSExtValue() == (int32_t)CN->getSExtValue()) { - isCN = true; - int64_t CNVal = CN->getSExtValue(); - if (CNVal == 1) - isInc = true; - else if (CNVal == -1) - isDec = true; - else if (CNVal >= 0) - Val = CurDAG->getTargetConstant(CNVal, NVT); - else { - isSub = true; - Val = CurDAG->getTargetConstant(-CNVal, NVT); - } - } else if (Val.hasOneUse() && - Val.getOpcode() == ISD::SUB && - X86::isZeroNode(Val.getOperand(0))) { - isSub = true; - Val = Val.getOperand(1); - } - - DebugLoc dl = Node->getDebugLoc(); - unsigned Opc = 0; - switch (NVT.getSimpleVT().SimpleTy) { - default: return 0; - case MVT::i8: - if (isInc) - Opc = X86::LOCK_INC8m; - else if (isDec) - Opc = X86::LOCK_DEC8m; - else if (isSub) { - if (isCN) - Opc = X86::LOCK_SUB8mi; - else - Opc = X86::LOCK_SUB8mr; - } else { - if (isCN) - Opc = X86::LOCK_ADD8mi; - else - Opc = X86::LOCK_ADD8mr; - } - break; - case MVT::i16: - if (isInc) - Opc = X86::LOCK_INC16m; - else if (isDec) - Opc = X86::LOCK_DEC16m; - else if (isSub) { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB16mi8; - else - Opc = X86::LOCK_SUB16mi; - } else - Opc = X86::LOCK_SUB16mr; - } else { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD16mi8; - else - Opc = X86::LOCK_ADD16mi; - } else - Opc = X86::LOCK_ADD16mr; - } - break; - case MVT::i32: - if (isInc) - Opc = X86::LOCK_INC32m; - else if (isDec) - Opc = X86::LOCK_DEC32m; - else if (isSub) { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB32mi8; - else - Opc = X86::LOCK_SUB32mi; - } else - Opc = X86::LOCK_SUB32mr; - } else { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD32mi8; - else - Opc = X86::LOCK_ADD32mi; - } else - Opc = X86::LOCK_ADD32mr; - } - break; - case MVT::i64: - if (isInc) - Opc = X86::LOCK_INC64m; - else if (isDec) - Opc = X86::LOCK_DEC64m; - else if (isSub) { - Opc = X86::LOCK_SUB64mr; - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB64mi8; - else if (i64immSExt32(Val.getNode())) - Opc = X86::LOCK_SUB64mi32; - } - } else { - Opc = X86::LOCK_ADD64mr; - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD64mi8; - else if (i64immSExt32(Val.getNode())) - Opc = X86::LOCK_ADD64mi32; - } - } - break; - } - - SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, NVT), 0); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); - if (isInc || isDec) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; - SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0); - cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); - SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); - } else { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; - SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); - cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); - SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); - } -} - +/// Atomic opcode table +/// enum AtomicOpc { + ADD, + SUB, + INC, + DEC, OR, AND, XOR, @@ -1659,6 +1537,58 @@ enum AtomicSz { static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { { + X86::LOCK_ADD8mi, + X86::LOCK_ADD8mr, + X86::LOCK_ADD16mi8, + X86::LOCK_ADD16mi, + X86::LOCK_ADD16mr, + X86::LOCK_ADD32mi8, + X86::LOCK_ADD32mi, + X86::LOCK_ADD32mr, + X86::LOCK_ADD64mi8, + X86::LOCK_ADD64mi32, + X86::LOCK_ADD64mr, + }, + { + X86::LOCK_SUB8mi, + X86::LOCK_SUB8mr, + X86::LOCK_SUB16mi8, + X86::LOCK_SUB16mi, + X86::LOCK_SUB16mr, + X86::LOCK_SUB32mi8, + X86::LOCK_SUB32mi, + X86::LOCK_SUB32mr, + X86::LOCK_SUB64mi8, + X86::LOCK_SUB64mi32, + X86::LOCK_SUB64mr, + }, + { + 0, + X86::LOCK_INC8m, + 0, + 0, + X86::LOCK_INC16m, + 0, + 0, + X86::LOCK_INC32m, + 0, + 0, + X86::LOCK_INC64m, + }, + { + 0, + X86::LOCK_DEC8m, + 0, + 0, + X86::LOCK_DEC16m, + 0, + 0, + X86::LOCK_DEC32m, + 0, + 0, + X86::LOCK_DEC64m, + }, + { X86::LOCK_OR8mi, X86::LOCK_OR8mr, X86::LOCK_OR16mi8, @@ -1669,7 +1599,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { X86::LOCK_OR32mr, X86::LOCK_OR64mi8, X86::LOCK_OR64mi32, - X86::LOCK_OR64mr + X86::LOCK_OR64mr, }, { X86::LOCK_AND8mi, @@ -1682,7 +1612,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { X86::LOCK_AND32mr, X86::LOCK_AND64mi8, X86::LOCK_AND64mi32, - X86::LOCK_AND64mr + X86::LOCK_AND64mr, }, { X86::LOCK_XOR8mi, @@ -1695,18 +1625,74 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { X86::LOCK_XOR32mr, X86::LOCK_XOR64mi8, X86::LOCK_XOR64mi32, - X86::LOCK_XOR64mr + X86::LOCK_XOR64mr, } }; +// Return the target constant operand for atomic-load-op and do simple +// translations, such as from atomic-load-add to lock-sub. The return value is +// one of the following 3 cases: +// + target-constant, the operand could be supported as a target constant. +// + empty, the operand is not needed any more with the new op selected. +// + non-empty, otherwise. +static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, + DebugLoc dl, + enum AtomicOpc &Op, EVT NVT, + SDValue Val) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { + int64_t CNVal = CN->getSExtValue(); + // Quit if not 32-bit imm. + if ((int32_t)CNVal != CNVal) + return Val; + // For atomic-load-add, we could do some optimizations. + if (Op == ADD) { + // Translate to INC/DEC if ADD by 1 or -1. + if ((CNVal == 1) || (CNVal == -1)) { + Op = (CNVal == 1) ? INC : DEC; + // No more constant operand after being translated into INC/DEC. + return SDValue(); + } + // Translate to SUB if ADD by negative value. + if (CNVal < 0) { + Op = SUB; + CNVal = -CNVal; + } + } + return CurDAG->getTargetConstant(CNVal, NVT); + } + + // If the value operand is single-used, try to optimize it. + if (Op == ADD && Val.hasOneUse()) { + // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x). + if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) { + Op = SUB; + return Val.getOperand(1); + } + // A special case for i16, which needs truncating as, in most cases, it's + // promoted to i32. We will translate + // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)) + if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 && + Val.getOperand(0).getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0).getOperand(0))) { + Op = SUB; + Val = Val.getOperand(0); + return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT, + Val.getOperand(1)); + } + } + + return Val; +} + SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; + DebugLoc dl = Node->getDebugLoc(); + // Optimize common patterns for __sync_or_and_fetch and similar arith // operations where the result is not used. This allows us to use the "lock" // version of the arithmetic instruction. - // FIXME: Same as for 'add' and 'sub', try to merge those down here. SDValue Chain = Node->getOperand(0); SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); @@ -1717,6 +1703,8 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { // Which index into the table. enum AtomicOpc Op; switch (Node->getOpcode()) { + default: + return 0; case ISD::ATOMIC_LOAD_OR: Op = OR; break; @@ -1726,16 +1714,14 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { case ISD::ATOMIC_LOAD_XOR: Op = XOR; break; - default: - return 0; - } - - bool isCN = false; - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); - if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) { - isCN = true; - Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT); + case ISD::ATOMIC_LOAD_ADD: + Op = ADD; + break; } + + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); + bool isUnOp = !Val.getNode(); + bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); unsigned Opc = 0; switch (NVT.getSimpleVT().SimpleTy) { @@ -1777,13 +1763,20 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { assert(Opc != 0 && "Invalid arith lock transform!"); - DebugLoc dl = Node->getDebugLoc(); + SDValue Ret; SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, NVT), 0); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; - SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + if (isUnOp) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, + array_lengthof(Ops)), 0); + } else { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, + array_lengthof(Ops)), 0); + } cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); SDValue RetVals[] = { Undef, Ret }; return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); @@ -2061,6 +2054,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::ATOMSUB64_DAG: case X86ISD::ATOMNAND64_DAG: case X86ISD::ATOMAND64_DAG: + case X86ISD::ATOMMAX64_DAG: + case X86ISD::ATOMMIN64_DAG: + case X86ISD::ATOMUMAX64_DAG: + case X86ISD::ATOMUMIN64_DAG: case X86ISD::ATOMSWAP64_DAG: { unsigned Opc; switch (Opcode) { @@ -2071,6 +2068,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; + case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; + case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; + case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; + case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; } SDNode *RetVal = SelectAtomic64(Node, Opc); @@ -2079,15 +2080,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; } - case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT); - if (RetVal) - return RetVal; - break; - } case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: { + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_ADD: { SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; @@ -2202,13 +2198,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); bool isSigned = Opcode == ISD::SMUL_LOHI; + bool hasBMI2 = Subtarget->hasBMI2(); if (!isSigned) { switch (NVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; - case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; - case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; + case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; + MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; + case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; + MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break; } } else { switch (NVT.getSimpleVT().SimpleTy) { @@ -2220,13 +2219,31 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } } - unsigned LoReg, HiReg; - switch (NVT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break; - case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break; - case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break; - case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break; + unsigned SrcReg, LoReg, HiReg; + switch (Opc) { + default: llvm_unreachable("Unknown MUL opcode!"); + case X86::IMUL8r: + case X86::MUL8r: + SrcReg = LoReg = X86::AL; HiReg = X86::AH; + break; + case X86::IMUL16r: + case X86::MUL16r: + SrcReg = LoReg = X86::AX; HiReg = X86::DX; + break; + case X86::IMUL32r: + case X86::MUL32r: + SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; + break; + case X86::IMUL64r: + case X86::MUL64r: + SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; + break; + case X86::MULX32rr: + SrcReg = X86::EDX; LoReg = HiReg = 0; + break; + case X86::MULX64rr: + SrcReg = X86::RDX; LoReg = HiReg = 0; + break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; @@ -2238,22 +2255,47 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { std::swap(N0, N1); } - SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, N0, SDValue()).getValue(1); + SDValue ResHi, ResLo; if (foldedLoad) { + SDValue Chain; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, - array_lengthof(Ops)); - InFlag = SDValue(CNode, 1); + if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, + array_lengthof(Ops)); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + Chain = SDValue(CNode, 2); + InFlag = SDValue(CNode, 3); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, + array_lengthof(Ops)); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); + } // Update the chain. - ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + ReplaceUses(N1.getValue(1), Chain); } else { - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag); - InFlag = SDValue(CNode, 0); + SDValue Ops[] = { N1, InFlag }; + if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, + array_lengthof(Ops)); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + InFlag = SDValue(CNode, 2); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, + array_lengthof(Ops)); + InFlag = SDValue(CNode, 0); + } } // Prevent use of AH in a REX instruction by referencing AX instead. @@ -2278,19 +2320,25 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - LoReg, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 0), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + if (ResLo.getNode() == 0) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, + InFlag); + InFlag = ResLo.getValue(2); + } + ReplaceUses(SDValue(Node, 0), ResLo); + DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - HiReg, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 1), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + if (ResHi.getNode() == 0) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, + InFlag); + InFlag = ResHi.getValue(2); + } + ReplaceUses(SDValue(Node, 1), ResHi); + DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } return NULL; @@ -2491,7 +2539,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i8, Reg); // Emit a testb. - return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testl %eax, $2048" to "testb %ah, $8". @@ -2522,8 +2576,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only // target GR8_NOREX registers, so make sure the register class is // forced. - return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32, - Subreg, ShiftedImm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, + MVT::i32, Subreg, ShiftedImm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testl %eax, $32776" to "testw %ax, $32776". @@ -2539,7 +2598,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i16, Reg); // Emit a testw. - return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testq %rax, $268468232" to "testl %eax, $268468232". @@ -2555,7 +2620,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i32, Reg); // Emit a testl. - return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } } break; @@ -2610,85 +2681,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return Result; } - - // FIXME: Custom handling because TableGen doesn't support multiple implicit - // defs in an instruction pattern - case X86ISD::PCMPESTRI: { - SDValue N0 = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); - SDValue N2 = Node->getOperand(2); - SDValue N3 = Node->getOperand(3); - SDValue N4 = Node->getOperand(4); - - // Make sure last argument is a constant - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4); - if (!Cst) - break; - - uint64_t Imm = Cst->getZExtValue(); - - SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, - X86::EAX, N1, SDValue()).getValue(1); - InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, - N3, InFlag).getValue(1); - - SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag }; - unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : - X86::PCMPESTRIrr; - InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops, - array_lengthof(Ops)), 0); - - if (!SDValue(Node, 0).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::ECX, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 0), Result); - } - if (!SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::EFLAGS, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 1), Result); - } - - return NULL; - } - - // FIXME: Custom handling because TableGen doesn't support multiple implicit - // defs in an instruction pattern - case X86ISD::PCMPISTRI: { - SDValue N0 = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); - SDValue N2 = Node->getOperand(2); - - // Make sure last argument is a constant - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2); - if (!Cst) - break; - - uint64_t Imm = Cst->getZExtValue(); - - SDValue Ops[] = { N0, N1, getI8Imm(Imm) }; - unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : - X86::PCMPISTRIrr; - SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops, - array_lengthof(Ops)), 0); - - if (!SDValue(Node, 0).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::ECX, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 0), Result); - } - if (!SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::EFLAGS, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 1), Result); - } - - return NULL; - } } SDNode *ResNode = SelectCode(Node); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5c525ae..4ab92ad 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -14,20 +14,15 @@ #define DEBUG_TYPE "x86-isel" #include "X86ISelLowering.h" +#include "Utils/X86ShuffleDecode.h" #include "X86.h" #include "X86InstrBuilder.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" -#include "Utils/X86ShuffleDecode.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalAlias.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -35,14 +30,19 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/VariadicFunction.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -158,10 +158,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); - TD = getTargetData(); + TD = getDataLayout(); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -180,11 +179,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - setStackPointerRegisterToSaveRestore(X86StackPtr); + setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass i32 with i8 on Atom when compiling with O2 if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) - addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext())); + addBypassSlowDiv(32, 8); if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. @@ -457,6 +456,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); @@ -514,6 +521,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); } if (Subtarget->hasCmpxchg16b()) { @@ -545,6 +556,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); @@ -647,7 +659,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f32 , Expand); setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f32 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 @@ -711,74 +725,79 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (int VT = MVT::FIRST_VECTOR_VALUETYPE; - VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) { - setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); - setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); - setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); - setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); - setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + setOperationAction(ISD::ADD , VT, Expand); + setOperationAction(ISD::SUB , VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::MUL , VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::LOAD, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); + setOperationAction(ISD::TRUNCATE, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND, VT, Expand); + setOperationAction(ISD::ZERO_EXTEND, VT, Expand); + setOperationAction(ISD::ANY_EXTEND, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, + setTruncStoreAction(VT, (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -851,6 +870,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); @@ -933,6 +953,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + // As there is no 64-bit GPR available, we need build a special custom + // sequence to convert from v2i32 to v2f32. + if (!Subtarget->is64Bit()) + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } @@ -949,7 +979,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); + setOperationAction(ISD::FRINT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -992,7 +1030,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); - if (Subtarget->hasAVX2()) { + if (Subtarget->hasInt256()) { setOperationAction(ISD::SRL, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v4i32, Legal); @@ -1011,7 +1049,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); @@ -1029,6 +1067,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); setOperationAction(ISD::FABS, MVT::v8f32, Custom); @@ -1038,13 +1080,26 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); + setOperationAction(ISD::FRINT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); @@ -1070,16 +1125,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { - setOperationAction(ISD::FMA, MVT::v8f32, Custom); - setOperationAction(ISD::FMA, MVT::v4f64, Custom); - setOperationAction(ISD::FMA, MVT::v4f32, Custom); - setOperationAction(ISD::FMA, MVT::v2f64, Custom); - setOperationAction(ISD::FMA, MVT::f32, Custom); - setOperationAction(ISD::FMA, MVT::f64, Custom); + setOperationAction(ISD::FMA, MVT::v8f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f64, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::f32, Legal); + setOperationAction(ISD::FMA, MVT::f64, Legal); } - if (Subtarget->hasAVX2()) { + if (Subtarget->hasInt256()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); setOperationAction(ISD::ADD, MVT::v16i16, Legal); @@ -1185,7 +1247,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // @@ -1235,10 +1296,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1262,13 +1321,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setPrefFunctionAlignment(4); // 2^4 bytes. } - EVT X86TargetLowering::getSetCCResultType(EVT VT) const { if (!VT.isVector()) return MVT::i8; return VT.changeVectorElementTypeToInteger(); } - /// getMaxByValAlign - Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { @@ -1318,34 +1375,30 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, -/// probably because the source does not need to be loaded. If -/// 'IsZeroVal' is true, that means it's safe to return a -/// non-scalar-integer type, e.g. empty string source, constant, or loaded -/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is -/// constant so it does not need to be loaded. +/// probably because the source does not need to be loaded. If 'IsMemset' is +/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that +/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy +/// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - // FIXME: This turns off use of xmm stores for memset/memcpy on targets like - // linux. This is because the stack realignment code can't handle certain - // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); - if (IsZeroVal && - !F->hasFnAttr(Attribute::NoImplicitFloat)) { + if ((!IsMemset || ZeroMemset) && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16))) && - Subtarget->getStackAlignment() >= 16) { - if (Subtarget->getStackAlignment() >= 32) { - if (Subtarget->hasAVX2()) + (SrcAlign == 0 || SrcAlign >= 16)))) { + if (Size >= 32) { + if (Subtarget->hasInt256()) return MVT::v8i32; - if (Subtarget->hasAVX()) + if (Subtarget->hasFp256()) return MVT::v8f32; } if (Subtarget->hasSSE2()) @@ -1354,7 +1407,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && - Subtarget->getStackAlignment() >= 8 && Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. @@ -1366,6 +1418,21 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::i32; } +bool X86TargetLowering::isSafeMemOpType(MVT VT) const { + if (VT == MVT::f32) + return X86ScalarSSEf32; + else if (VT == MVT::f64) + return X86ScalarSSEf64; + return true; +} + +bool +X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { + if (Fast) + *Fast = Subtarget->isUnalignedMemAccessFast(); + return true; +} + /// getJumpTableEncoding - Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. @@ -1419,10 +1486,10 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, // FIXME: Why this routine is here? Move to RegInfo! std::pair<const TargetRegisterClass*, uint8_t> -X86TargetLowering::findRepresentativeClass(EVT VT) const{ +X86TargetLowering::findRepresentativeClass(MVT VT) const{ const TargetRegisterClass *RRC = 0; uint8_t Cost = 1; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: @@ -1464,7 +1531,6 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } - //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1636,8 +1702,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -EVT -X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, +MVT +X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? @@ -1646,7 +1712,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, else ReturnMVT = MVT::i32; - EVT MinVT = getRegisterType(Context, ReturnMVT); + MVT MinVT = getRegisterType(ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } @@ -1668,7 +1734,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); @@ -1712,7 +1778,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } - //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// @@ -1776,7 +1841,8 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, /// IsTailCallConvention - Return true if the calling convention is one that /// supports tail call optimization. static bool IsTailCallConvention(CallingConv::ID CC) { - return (CC == CallingConv::Fast || CC == CallingConv::GHC); + return (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { @@ -1863,7 +1929,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool IsWin64 = Subtarget->isTargetWin64(); assert(!(isVarArg && IsTailCallConvention(CallConv)) && - "Var args not supported with calling convention fastcc or ghc"); + "Var args not supported with calling convention fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -1925,10 +1991,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. - if (RegVT.isVector()) { - ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), - ArgValue); - } else + if (RegVT.isVector()) + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); + else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { @@ -2004,7 +2069,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); - bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = Fn->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && @@ -2152,16 +2218,14 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, - bool Is64Bit, int FPDiff, DebugLoc dl) { + SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, + unsigned SlotSize, int FPDiff, DebugLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. - int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); - EVT VT = Is64Bit ? MVT::i64 : MVT::i32; - SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), false, false, 0); @@ -2196,7 +2260,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, - MF.getFunction()->hasStructRetAttr(), + MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require @@ -2209,7 +2273,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } assert(!(isVarArg && IsTailCallConvention(CallConv)) && - "Var args not supported with calling convention fastcc or ghc"); + "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -2236,14 +2300,15 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FPDiff = 0; if (isTailCall && !IsSibcall) { // Lower arguments at fp - stackoffset + fpdiff. - unsigned NumBytesCallerPushed = - MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); + unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); + FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. - if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) - MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); + if (FPDiff < X86Info->getTCReturnAddrDelta()) + X86Info->setTCReturnAddrDelta(FPDiff); } if (!IsSibcall) @@ -2320,7 +2385,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } @@ -2408,7 +2474,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, + StackPtr = DAG.getCopyFromReg(Chain, dl, + RegInfo->getStackRegister(), getPointerTy()); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); @@ -2430,7 +2497,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, &MemOpChains2[0], MemOpChains2.size()); // Store the return address to the appropriate stack slot. - Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, + getPointerTy(), RegInfo->getSlotSize(), FPDiff, dl); } @@ -2480,7 +2548,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && - cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { + cast<Function>(GV)->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). @@ -2598,7 +2668,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ins, dl, DAG, InVals); } - //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// @@ -2641,7 +2710,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; - uint64_t SlotSize = TD->getPointerSize(); + unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); @@ -2716,6 +2785,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, + Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -2727,6 +2797,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2899,7 +2976,6 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, return X86::createFastISel(funcInfo, libInfo); } - //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// @@ -3001,7 +3077,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. - uint64_t SlotSize = TD->getPointerSize(); + unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -3010,7 +3086,6 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); } - bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. @@ -3061,6 +3136,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv, return TailCallOpt; case CallingConv::GHC: return TailCallOpt; + case CallingConv::HiPE: + return TailCallOpt; } } @@ -3191,9 +3268,7 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { - if (Val < 0 || Val == CmpVal) - return true; - return false; + return (Val < 0 || Val == CmpVal); } /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning @@ -3220,8 +3295,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { - if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { + if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. @@ -3249,8 +3324,8 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { - if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { + if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. @@ -3281,7 +3356,7 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, const X86Subtarget *Subtarget) { if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || - (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())) + (VT.getSizeInBits() == 256 && !Subtarget->hasInt256())) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3368,9 +3443,9 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, /// specifies a shuffle of elements that is suitable for input to 128/256-bit /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be /// reverse of what x86 shuffles want. -static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX, +static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, bool Commuted = false) { - if (!HasAVX && VT.getSizeInBits() == 256) + if (!HasFp256 && VT.getSizeInBits() == 256) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -3527,7 +3602,7 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, if (!MatchEvenMask && !MatchOddMask) return SDValue(); - + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); SDValue Op0 = SVOp->getOperand(0); @@ -3549,14 +3624,14 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, - bool HasAVX2, bool V2IsSplat = false) { + bool HasInt256, bool V2IsSplat = false) { unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && - (!HasAVX2 || (NumElts != 16 && NumElts != 32))) + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3588,14 +3663,14 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, - bool HasAVX2, bool V2IsSplat = false) { + bool HasInt256, bool V2IsSplat = false) { unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && - (!HasAVX2 || (NumElts != 16 && NumElts != 32))) + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3626,14 +3701,14 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, - bool HasAVX2) { + bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && - (!HasAVX2 || (NumElts != 16 && NumElts != 32))) + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern @@ -3668,14 +3743,14 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && - (!HasAVX2 || (NumElts != 16 && NumElts != 32))) + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3724,8 +3799,8 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { - if (!HasAVX || !VT.is256BitVector()) +static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { + if (!HasFp256 || !VT.is256BitVector()) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3784,8 +3859,8 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. -static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { - if (!HasAVX) +static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { + if (!HasFp256) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3885,8 +3960,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { - if (!HasAVX || !VT.is256BitVector()) +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { + if (!HasFp256 || !VT.is256BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -4291,7 +4366,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (Size == 256) { // AVX - if (Subtarget->hasAVX2()) { // AVX2 + if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); @@ -4312,7 +4387,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, +static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); unsigned Size = VT.getSizeInBits(); @@ -4320,7 +4395,7 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; if (Size == 256) { - if (HasAVX2) { // AVX2 + if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); } else { // AVX @@ -4594,7 +4669,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, MVT ShufVT = V.getValueType().getSimpleVT(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; - SDValue ImmN; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) @@ -5021,8 +5095,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. SDValue -X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { - if (!Subtarget->hasAVX()) +X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->hasFp256()) return SDValue(); EVT VT = Op.getValueType(); @@ -5068,7 +5142,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { - if (!Subtarget->hasAVX2()) + if (!Subtarget->hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. @@ -5095,7 +5169,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { // Handle the broadcasting a single constant scalar from the constant pool // into a vector. On Sandybridge it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. - if (ConstSplatVal && Subtarget->hasAVX2()) { + if (ConstSplatVal && Subtarget->hasInt256()) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); @@ -5123,7 +5197,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. - if (!IsLoad && Subtarget->hasAVX2() && + if (!IsLoad && Subtarget->hasInt256() && (ScalarSize == 32 || (Is256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); @@ -5136,7 +5210,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm - if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { + if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5145,84 +5219,78 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64 -// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the -// constraint of matching input/output vector elements. SDValue -X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); - SDNode *N = Op.getNode(); +X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - unsigned NumElts = Op.getNumOperands(); - // Check supported types and sub-targets. - // - // Only v2f32 -> v2f64 needs special handling. - if (VT != MVT::v2f64 || !Subtarget->hasSSE2()) + // Skip if insert_vec_elt is not supported. + if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); - SDValue VecIn; - EVT VecInVT; - SmallVector<int, 8> Mask; - EVT SrcVT = MVT::Other; + DebugLoc DL = Op.getDebugLoc(); + unsigned NumElems = Op.getNumOperands(); - // Check the patterns could be translated into X86vfpext. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue In = N->getOperand(i); - unsigned Opcode = In.getOpcode(); + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); - // Skip if the element is undefined. - if (Opcode == ISD::UNDEF) { - Mask.push_back(-1); + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); + + if (Opc == ISD::UNDEF) continue; - } - // Quit if one of the elements is not defined from 'fpext'. - if (Opcode != ISD::FP_EXTEND) - return SDValue(); + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 elements need inserting. + if (InsertIndices.size() > 1) + return SDValue(); - // Check how the source of 'fpext' is defined. - SDValue L2In = In.getOperand(0); - EVT L2InVT = L2In.getValueType(); + InsertIndices.push_back(i); + continue; + } - // Check the original type - if (SrcVT == MVT::Other) - SrcVT = L2InVT; - else if (SrcVT != L2InVT) // Quit if non-homogenous typed. - return SDValue(); + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); - // Check whether the value being 'fpext'ed is extracted from the same - // source. - Opcode = L2In.getOpcode(); + // Quit if extracted from vector of different type. + if (ExtractedFromVec.getValueType() != VT) + return SDValue(); - // Quit if it's not extracted with a constant index. - if (Opcode != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(L2In.getOperand(1))) + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) return SDValue(); - SDValue ExtractedFromVec = L2In.getOperand(0); + if (VecIn1.getNode() == 0) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (VecIn2.getNode() == 0) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle + return SDValue(); + } - if (VecIn.getNode() == 0) { - VecIn = ExtractedFromVec; - VecInVT = ExtractedFromVec.getValueType(); - } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec. - return SDValue(); + unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); - Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue()); + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; } - // Quit if all operands of BUILD_VECTOR are undefined. - if (!VecIn.getNode()) + if (VecIn1.getNode() == 0) return SDValue(); - // Fill the remaining mask as undef. - for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i) - Mask.push_back(-1); + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx)); + } - return DAG.getNode(X86ISD::VFPEXT, DL, VT, - DAG.getVectorShuffle(VecInVT, DL, - VecIn, DAG.getUNDEF(VecInVT), - &Mask[0])); + return NV; } SDValue @@ -5247,20 +5315,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; - return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); + return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } SDValue Broadcast = LowerVectorBroadcast(Op, DAG); if (Broadcast.getNode()) return Broadcast; - SDValue FpExt = LowerVectorFpExtend(Op, DAG); - if (FpExt.getNode()) - return FpExt; - unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5505,6 +5569,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; + // Check for a build vector from mostly shuffle plus few inserting. + SDValue Sh = buildFromShuffleMostly(Op, DAG); + if (Sh.getNode()) + return Sh; + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; @@ -5571,8 +5640,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -SDValue -X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors @@ -5581,70 +5649,59 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue +LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - MVT VT = SVOp->getValueType(0).getSimpleVT(); + EVT VT = SVOp->getValueType(0); + EVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); - if (!Subtarget->hasSSE41()) + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) return SDValue(); - unsigned ISDNo = 0; - MVT OpTy; - - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v8i16: - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v8i16; - break; - case MVT::v4i32: - case MVT::v4f32: - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v4f32; - break; - case MVT::v2i64: - case MVT::v2f64: - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v2f64; - break; - case MVT::v8i32: - case MVT::v8f32: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v8f32; - break; - case MVT::v4i64: - case MVT::v4f64: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v4f64; - break; - } - assert(ISDNo && "Invalid Op Number"); + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; - unsigned MaskVals = 0; + // Blend for v16i16 should be symetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { - for (unsigned i = 0; i != NumElems; ++i) { + int SndLaneEltIdx = (NumLanes == 2) ? + SVOp->getMaskElt(i + NumElemsInLane) : -1; int EltIdx = SVOp->getMaskElt(i); - if (EltIdx == (int)i || EltIdx < 0) - MaskVals |= (1<<i); - else if (EltIdx == (int)(i + NumElems)) - continue; // Bit is set to zero; - else + + if ((EltIdx == -1 || EltIdx == (int)i) && + (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane))) + continue; + + if (((unsigned)EltIdx == (i + NumElems)) && + (SndLaneEltIdx == -1 || + (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) + MaskValue |= (1<<i); + else return SDValue(); } - V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); - SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, - DAG.getConstant(MaskVals, MVT::i32)); + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + EVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } @@ -5653,9 +5710,9 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, - SelectionDAG &DAG) const { +static SDValue +LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -6028,8 +6085,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // v32i8 shuffles - Translate to VPSHUFB if possible. static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, - const X86TargetLowering &TLI) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT VT = SVOp->getValueType(0); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -6040,11 +6097,11 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); - // VPSHUFB may be generated if + // VPSHUFB may be generated if // (1) one of input vector is undefined or zeroinitializer. // The mask value 0x80 puts 0 in the corresponding slot of the vector. // And (2) the mask indexes don't cross the 128-bit lane. - if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() || + if (VT != MVT::v32i8 || !Subtarget->hasInt256() || (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) return SDValue(); @@ -6404,34 +6461,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } static bool MayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) // BUILD_VECTOR (load), undef V = V.getOperand(0); - if (MayFoldLoad(V)) - return true; - return false; -} -// FIXME: the version above should always be used. Since there's -// a bug where several vector shuffles can't be folded because the -// DAG is not updated during lowering and a node claims to have two -// uses while it only has one, use this version, and let isel match -// another instruction if the load really happens to have more than -// one use. Remove this version after this bug get fixed. -// rdar://8434668, PR8156 -static bool RelaxedMayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) - V = V.getOperand(0); - if (ISD::isNormalLoad(V.getNode())) - return true; - return false; + return MayFoldLoad(V); } static @@ -6537,6 +6577,81 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +// Reduce a vector shuffle to zext. +SDValue +X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { + // PMOVZX is only available from SSE41. + if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // Only AVX2 support 256-bit vector integer extending. + if (!Subtarget->hasInt256() && VT.is256BitVector()) + return SDValue(); + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + DebugLoc DL = Op.getDebugLoc(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = VT.getVectorNumElements(); + + // Extending is an unary operation and the element type of the source vector + // won't be equal to or larger than i64. + if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || + VT.getVectorElementType() == MVT::i64) + return SDValue(); + + // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. + unsigned Shift = 1; // Start from 2, i.e. 1 << 1. + while ((1U << Shift) < NumElems) { + if (SVOp->getMaskElt(1U << Shift) == 1) + break; + Shift += 1; + // The maximal ratio is 8, i.e. from i8 to i64. + if (Shift > 3) + return SDValue(); + } + + // Check the shuffle mask. + unsigned Mask = (1U << Shift) - 1; + for (unsigned i = 0; i != NumElems; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if ((i & Mask) != 0 && EltIdx != -1) + return SDValue(); + if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) + return SDValue(); + } + + unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; + EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + + if (!isTypeLegal(NVT)) + return SDValue(); + + // Simplify the operand as it's prepared to be fed into shuffle. + unsigned SignificantBits = NVT.getSizeInBits() >> Shift; + if (V1.getOpcode() == ISD::BITCAST && + V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V1.getOperand(0) + .getOperand(0).getValueType().getSizeInBits() == SignificantBits) { + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); + ConstantSDNode *CIdx = + dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); + // If it's foldable, i.e. normal load with single use, we will let code + // selection to fold it. Otherwise, we will short the conversion sequence. + if (CIdx && CIdx->getZExtValue() == 0 && + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); +} + SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); @@ -6560,13 +6675,18 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || - (Size == 256 && NumElem < 8)) + (Size == 256 && NumElem <= 8)) return SDValue(); // All remaning splats are promoted to target supported vector shuffles. return PromoteSplat(SVOp, DAG); } + // Check integer expanding shuffles. + SDValue NewOp = lowerVectorIntExtend(Op, DAG); + if (NewOp.getNode()) + return NewOp; + // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || @@ -6613,10 +6733,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool V1IsSplat = false; bool V2IsSplat = false; bool HasSSE2 = Subtarget->hasSSE2(); - bool HasAVX = Subtarget->hasAVX(); - bool HasAVX2 = Subtarget->hasAVX2(); + bool HasFp256 = Subtarget->hasFp256(); + bool HasInt256 = Subtarget->hasInt256(); MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); + bool OptForSize = MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); @@ -6650,20 +6771,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. - if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && - V2IsUndef && RelaxedMayFoldVectorLoad(V1)) + V2IsUndef && MayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && + if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); @@ -6676,12 +6797,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); - if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); + if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + DAG); + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } @@ -6712,7 +6834,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) + if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) return getMOVLowToHigh(Op, dl, DAG, HasSSE2); if (isMOVHLPSMask(M, VT)) @@ -6762,10 +6884,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (isUNPCKLMask(M, VT, HasAVX2)) + if (isUNPCKLMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (isUNPCKHMask(M, VT, HasAVX2)) + if (isUNPCKHMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { @@ -6774,9 +6896,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // new vector_shuffle with the corrected mask.p SmallVector<int, 8> NewMask(M.begin(), M.end()); NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) + if (isUNPCKLMask(NewMask, VT, HasInt256, true)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) + if (isUNPCKHMask(NewMask, VT, HasInt256, true)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } @@ -6788,15 +6910,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { std::swap(V1IsSplat, V2IsSplat); Commuted = false; - if (isUNPCKLMask(M, VT, HasAVX2)) + if (isUNPCKLMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (isUNPCKHMask(M, VT, HasAVX2)) + if (isUNPCKHMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) + if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are @@ -6814,23 +6936,23 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } - if (isPSHUFHWMask(M, VT, HasAVX2)) + if (isPSHUFHWMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, getShufflePSHUFHWImmediate(SVOp), DAG); - if (isPSHUFLWMask(M, VT, HasAVX2)) + if (isPSHUFLWMask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT, HasAVX)) + if (isSHUFPMask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); - if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// @@ -6839,12 +6961,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT, HasAVX)) { - if (HasAVX2 && VT == MVT::v8i32) + if (isVPERMILPMask(M, VT, HasFp256)) { + if (HasInt256 && VT == MVT::v8i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, @@ -6852,7 +6974,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // Handle VPERM2F128/VPERM2I128 permutations - if (isVPERM2X128Mask(M, VT, HasAVX)) + if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); @@ -6860,7 +6982,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; - if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { + if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { SmallVector<SDValue, 8> permclMask; for (unsigned i = 0; i != 8; ++i) { permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); @@ -6872,11 +6994,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); } - if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) + if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, getShuffleCLImmediate(SVOp), DAG); - //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -6885,7 +7006,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } @@ -6897,7 +7018,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (VT == MVT::v32i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this); + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } @@ -6925,9 +7046,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6942,9 +7063,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6978,7 +7099,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, return SDValue(); } - SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { @@ -7028,9 +7148,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Transform it so it match pextrw which produces a 32-bit result. EVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -7173,8 +7293,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT OpVT = Op.getValueType(); @@ -7206,9 +7325,9 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. -SDValue -X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget->hasAVX()) { +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (Subtarget->hasFp256()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); SDValue Idx = Op.getNode()->getOperand(1); @@ -7226,9 +7345,9 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. -SDValue -X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget->hasAVX()) { +static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (Subtarget->hasFp256()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); SDValue SubVec = Op.getNode()->getOperand(1); @@ -7344,7 +7463,6 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); - // With PIC, the address is actually $g + Offset. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { @@ -7370,9 +7488,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = getTargetMachine().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); DebugLoc dl = Op.getDebugLoc(); - SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), - /*isTarget=*/true, OpFlags); + SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, + OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -7481,8 +7600,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue InFlag; DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), PtrVT), InFlag); + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -7730,7 +7849,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("TLS not implemented for this target."); } - /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ @@ -7983,11 +8101,29 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + EVT SVT = N0.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || + SVT == MVT::v8i8 || SVT == MVT::v8i16) && + "Custom UINT_TO_FP is not supported!"); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); +} + SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); + if (Op.getValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. @@ -8161,10 +8297,217 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co } } +static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = Op->getValueType(0); + SDValue In = Op->getOperand(0); + EVT InVT = In.getValueType(); + DebugLoc dl = Op->getDebugLoc(); + + // Optimize vectors in AVX mode: + // + // v8i16 -> v8i32 + // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. + // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. + // Concat upper and lower parts. + // + // v4i32 -> v4i64 + // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. + // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. + // Concat upper and lower parts. + // + + if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && + ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In); + + SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); + SDValue Undef = DAG.getUNDEF(InVT); + bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->hasFp256()) { + SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); + if (Res.getNode()) + return Res; + } + + return SDValue(); +} +SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + if (Subtarget->hasFp256()) { + SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); + if (Res.getNode()) + return Res; + } + + if (!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!"); + + // AVX2 has better support of integer extending. + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In); + static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1}; + SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, + DAG.getVectorShuffle(MVT::v8i16, DL, In, + DAG.getUNDEF(MVT::v8i16), + &Mask[0])); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); +} + +SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { + // On AVX2, v4i64 -> v4i32 becomes VPERMD. + if (Subtarget->hasInt256()) { + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), + ShufMask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, + DAG.getIntPtrConstant(0)); + } + + // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS. + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(2)); + + OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); + + // The PSHUFD mask: + static const int ShufMask1[] = {0, 2, 0, 0}; + SDValue Undef = DAG.getUNDEF(VT); + OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1); + + // The MOVLHPS mask: + static const int ShufMask2[] = {0, 1, 4, 5}; + return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); + } + + if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) { + // On AVX2, v8i32 -> v8i16 becomed PSHUFB. + if (Subtarget->hasInt256()) { + In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); + + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, + &pshufbMask[0], 32); + In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); + + static const int ShufMask[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, In); + } + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(0)); + + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(4)); + + OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); + + // The PSHUFB mask: + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; + + SDValue Undef = DAG.getUNDEF(MVT::v16i8); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + + OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); + + // The MOVLHPS Mask: + static const int ShufMask2[] = {0, 1, 4, 5}; + SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); + } + + // Handle truncation of V256 to V128 using shuffles. + if (!VT.is128BitVector() || !SVT.is256BitVector()) + return SDValue(); + + assert(VT.getVectorNumElements() != SVT.getVectorNumElements() && + "Invalid op"); + assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + NumElems * 2); + + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare truncation shuffle mask + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, + DAG.getNode(ISD::BITCAST, DL, NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) + if (Op.getValueType().isVector()) { + if (Op.getValueType() == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + MVT::v8i32, Op.getOperand(0))); return SDValue(); + } std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); @@ -8199,6 +8542,20 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } +SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, + In, DAG.getUNDEF(SVT))); +} + SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); @@ -8337,7 +8694,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } -SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -8348,6 +8705,98 @@ SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } +// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. +// +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (!Op->hasOneUse()) + return SDValue(); + + SDNode *N = Op.getNode(); + DebugLoc DL = N->getDebugLoc(); + + SmallVector<SDValue, 8> Opnds; + DenseMap<SDValue, unsigned> VecInMap; + EVT VT = MVT::Other; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + SDValue ExtractedFromVec = I->getOperand(0); + DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); + if (M == VecInMap.end()) { + VT = ExtractedFromVec.getValueType(); + // Quit if not 128/256-bit vector. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + // Quit if not the same type. + if (VecInMap.begin() != VecInMap.end() && + VT != VecInMap.begin()->first.getValueType()) + return SDValue(); + M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; + } + M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + } + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Not extracted from 128-/256-bit vector."); + + unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + SmallVector<SDValue, 8> VecIns; + + for (DenseMap<SDValue, unsigned>::const_iterator + I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { + // Quit if not all elements are used. + if (I->second != FullMask) + return SDValue(); + VecIns.push_back(I->first); + } + + EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + + // Cast all vectors into TestVT for PTEST. + for (unsigned i = 0, e = VecIns.size(); i < e; ++i) + VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); + + // If more than one full vectors are evaluated, OR them first before PTEST. + for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is only + // 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + } + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, + VecIns.back(), VecIns.back()); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8487,9 +8936,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: Opcode = X86ISD::SUB; break; - case ISD::OR: Opcode = X86ISD::OR; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; + case ISD::OR: { + if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); + if (EFLAGS.getNode()) + return EFLAGS; + } + Opcode = X86ISD::OR; + break; + } } NumOperands = 2; @@ -8595,6 +9052,11 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +static bool isAllOnes(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isAllOnesValue(); +} + /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -8643,6 +9105,14 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } if (LHS.getNode()) { + // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip + // the condition code later. + bool Invert = false; + if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) { + Invert = true; + LHS = LHS.getOperand(0); + } + // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because @@ -8658,7 +9128,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); - unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + // Flip the condition if the LHS was a not instruction + if (Invert) + Cond = X86::GetOppositeBranchCondition(Cond); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, MVT::i8), BT); } @@ -8751,7 +9224,6 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } - SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue Cond; SDValue Op0 = Op.getOperand(0); @@ -8829,7 +9301,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { } // Break 256-bit integer vector compare into smaller ones. - if (VT.is256BitVector() && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has @@ -8859,8 +9331,28 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) return SDValue(); - if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) - return SDValue(); + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { + // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with + // pcmpeqd + pshufd + pand. + assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); + + // First cast everything to the right type, + Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); + Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + + // Do the compare. + SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); + + // Make sure the lower and upper halves are both all-ones. + const int Mask[] = { 1, 0, 3, 2 }; + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); + Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } } // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -8916,11 +9408,6 @@ static bool isZero(SDValue V) { return C && C->isNullValue(); } -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isAllOnesValue(); -} - static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; @@ -9099,6 +9586,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } + // X86 doesn't have an i8 cmov. If both operands are the result of a truncate + // widen the cmov and push the truncate through. This avoids introducing a new + // branch during isel and doesn't add any extensions. + if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + // Blacklist CopyFromReg to avoid partial register stalls. + T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ + SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); @@ -9106,6 +9608,53 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); } +SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op->getValueType(0); + SDValue In = Op->getOperand(0); + EVT InVT = In.getValueType(); + DebugLoc dl = Op->getDebugLoc(); + + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16)) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + unsigned NumElems = InVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(InVT); + + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. @@ -9394,7 +9943,6 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Chain, Dest, CC, Cond); } - // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca is needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure @@ -9453,7 +10001,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); - Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + SPTy).getValue(1); SDValue Ops1[2] = { Chain.getValue(0), Chain }; return DAG.getMergeValues(Ops1, 2, dl); @@ -9536,7 +10085,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); + uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. @@ -9556,7 +10105,9 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Sanity Check: Make sure using fp_offset makes sense. assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() - .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && + .getFunction()->getAttributes() + .hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -9587,7 +10138,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); } -SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); SDValue Chain = Op.getOperand(0); @@ -9648,8 +10200,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -SDValue -X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IntNo) { @@ -9761,6 +10312,14 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics + case Intrinsic::x86_sse2_psubus_b: + case Intrinsic::x86_sse2_psubus_w: + case Intrinsic::x86_avx2_psubus_b: + case Intrinsic::x86_avx2_psubus_w: + return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE3/AVX horizontal add/sub intrinsics case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: @@ -9810,6 +10369,100 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const Op.getOperand(1), Op.getOperand(2)); } + // SSE2/SSE41/AVX2 integer max/min intrinsics. + case Intrinsic::x86_sse2_pmaxu_b: + case Intrinsic::x86_sse41_pmaxuw: + case Intrinsic::x86_sse41_pmaxud: + case Intrinsic::x86_avx2_pmaxu_b: + case Intrinsic::x86_avx2_pmaxu_w: + case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_sse2_pminu_b: + case Intrinsic::x86_sse41_pminuw: + case Intrinsic::x86_sse41_pminud: + case Intrinsic::x86_avx2_pminu_b: + case Intrinsic::x86_avx2_pminu_w: + case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_sse41_pmaxsb: + case Intrinsic::x86_sse2_pmaxs_w: + case Intrinsic::x86_sse41_pmaxsd: + case Intrinsic::x86_avx2_pmaxs_b: + case Intrinsic::x86_avx2_pmaxs_w: + case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_sse41_pminsb: + case Intrinsic::x86_sse2_pmins_w: + case Intrinsic::x86_sse41_pminsd: + case Intrinsic::x86_avx2_pmins_b: + case Intrinsic::x86_avx2_pmins_w: + case Intrinsic::x86_avx2_pmins_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse2_pmaxu_b: + case Intrinsic::x86_sse41_pmaxuw: + case Intrinsic::x86_sse41_pmaxud: + case Intrinsic::x86_avx2_pmaxu_b: + case Intrinsic::x86_avx2_pmaxu_w: + case Intrinsic::x86_avx2_pmaxu_d: + Opcode = X86ISD::UMAX; + break; + case Intrinsic::x86_sse2_pminu_b: + case Intrinsic::x86_sse41_pminuw: + case Intrinsic::x86_sse41_pminud: + case Intrinsic::x86_avx2_pminu_b: + case Intrinsic::x86_avx2_pminu_w: + case Intrinsic::x86_avx2_pminu_d: + Opcode = X86ISD::UMIN; + break; + case Intrinsic::x86_sse41_pmaxsb: + case Intrinsic::x86_sse2_pmaxs_w: + case Intrinsic::x86_sse41_pmaxsd: + case Intrinsic::x86_avx2_pmaxs_b: + case Intrinsic::x86_avx2_pmaxs_w: + case Intrinsic::x86_avx2_pmaxs_d: + Opcode = X86ISD::SMAX; + break; + case Intrinsic::x86_sse41_pminsb: + case Intrinsic::x86_sse2_pmins_w: + case Intrinsic::x86_sse41_pminsd: + case Intrinsic::x86_avx2_pmins_b: + case Intrinsic::x86_avx2_pmins_w: + case Intrinsic::x86_avx2_pmins_d: + Opcode = X86ISD::SMIN; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + + // SSE/SSE2/AVX floating point max/min intrinsics. + case Intrinsic::x86_sse_max_ps: + case Intrinsic::x86_sse2_max_pd: + case Intrinsic::x86_avx_max_ps_256: + case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_sse_min_ps: + case Intrinsic::x86_sse2_min_pd: + case Intrinsic::x86_avx_min_ps_256: + case Intrinsic::x86_avx_min_pd_256: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse_max_ps: + case Intrinsic::x86_sse2_max_pd: + case Intrinsic::x86_avx_max_ps_256: + case Intrinsic::x86_avx_max_pd_256: + Opcode = X86ISD::FMAX; + break; + case Intrinsic::x86_sse_min_ps: + case Intrinsic::x86_sse2_min_pd: + case Intrinsic::x86_avx_min_ps_256: + case Intrinsic::x86_avx_min_pd_256: + Opcode = X86ISD::FMIN; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + // AVX2 variable shift intrinsics case Intrinsic::x86_avx2_psllv_d: case Intrinsic::x86_avx2_psllv_q: @@ -9877,6 +10530,12 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); + case Intrinsic::x86_sse_sqrt_ps: + case Intrinsic::x86_sse2_sqrt_pd: + case Intrinsic::x86_avx_sqrt_ps_256: + case Intrinsic::x86_avx_sqrt_pd_256: + return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); + // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -10186,8 +10845,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const } } -SDValue -X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntNo) { @@ -10225,21 +10883,21 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(TD->getPointerSize(), - Subtarget->is64Bit() ? MVT::i64 : MVT::i32); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo(), false, false, false, 0); } @@ -10261,7 +10919,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - return DAG.getIntPtrConstant(2*TD->getPointerSize()); + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { @@ -10276,7 +10934,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(TD->getPointerSize())); + DAG.getIntPtrConstant(RegInfo->getSlotSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); @@ -10287,8 +10945,22 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); } -SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } @@ -10301,6 +10973,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, DebugLoc dl = Op.getDebugLoc(); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -10309,8 +10982,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. - const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); - const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix @@ -10375,7 +11048,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); - const AttrListPtr &Attrs = Func->getAttributes(); + const AttributeSet &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; @@ -10383,7 +11056,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.paramHasAttr(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) // FIXME: should only count parameters that are lowered to integers. InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; @@ -10412,7 +11085,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. - const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), @@ -10473,7 +11146,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, 2, 2); @@ -10506,12 +11178,11 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, DAG.getConstant(1, MVT::i16)), DAG.getConstant(3, MVT::i16)); - return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10545,8 +11216,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10571,7 +11241,7 @@ SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumBits = VT.getSizeInBits(); DebugLoc dl = Op.getDebugLoc(); @@ -10620,32 +11290,59 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.is256BitVector() && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntArith(Op, DAG); + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. + if (VT == MVT::v4i32) { + assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && + "Should not custom lower when pmuldq is available!"); + + // Extract the odd parts. + const int UnpackMask[] = { 1, -1, 3, -1 }; + SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); + SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); + + // Multiply the even parts. + SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); + // Now multiply odd parts. + SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); + + Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); + Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); + + // Merge the two vectors back together with a shuffle. This expands into 2 + // shuffles. + const int ShufMask[] = { 0, 4, 2, 6 }; + return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); + } + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Only know how to lower V2I64/V4I64 multiply"); - DebugLoc dl = Op.getDebugLoc(); - // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // @@ -10657,9 +11354,6 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; - SDValue A = Op.getOperand(0); - SDValue B = Op.getOperand(1); - SDValue ShAmt = DAG.getConstant(32, MVT::i32); SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); @@ -10701,7 +11395,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { uint64_t ShiftAmt = C->getZExtValue(); if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || - (Subtarget->hasAVX2() && + (Subtarget->hasInt256() && (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { if (Op.getOpcode() == ISD::SHL) return DAG.getNode(X86ISD::VSHLI, dl, VT, R, @@ -10758,7 +11452,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Unknown shift opcode."); } - if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, @@ -10909,7 +11603,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" @@ -11001,9 +11695,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, default: return SDValue(); case MVT::v8i32: case MVT::v16i16: - if (!Subtarget->hasAVX()) + if (!Subtarget->hasFp256()) return SDValue(); - if (!Subtarget->hasAVX2()) { + if (!Subtarget->hasInt256()) { // needs to be split unsigned NumElems = VT.getVectorNumElements(); @@ -11036,8 +11730,8 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, } } - -SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ +static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. @@ -11082,8 +11776,8 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); } -SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); @@ -11120,8 +11814,8 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } - -SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT T = Op.getValueType(); DebugLoc DL = Op.getDebugLoc(); unsigned Reg = 0; @@ -11152,8 +11846,8 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { return cpOut; } -SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); @@ -11171,8 +11865,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerBITCAST(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && @@ -11192,7 +11885,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, return SDValue(); } -SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); EVT T = Node->getValueType(0); @@ -11265,9 +11958,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); - case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -11275,8 +11968,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -11288,8 +11981,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); + case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); + case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -11300,7 +11998,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::VACOPY: return LowerVACOPY(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -11309,13 +12007,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); - case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, DAG); @@ -11325,7 +12025,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); - case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::ADDC: case ISD::ADDE: @@ -11386,6 +12086,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const { DebugLoc dl = N->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); @@ -11418,6 +12119,29 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::UINT_TO_FP: { + if (N->getOperand(0).getValueType() != MVT::v2i32 && + N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); + Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } + case ISD::FP_ROUND: { + if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) + return; + SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = N->getOperand(0); @@ -11485,6 +12209,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_SWAP: { unsigned Opc; switch (N->getOpcode()) { @@ -11507,6 +12235,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_XOR: Opc = X86ISD::ATOMXOR64_DAG; break; + case ISD::ATOMIC_LOAD_MAX: + Opc = X86ISD::ATOMMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_MIN: + Opc = X86ISD::ATOMMIN64_DAG; + break; + case ISD::ATOMIC_LOAD_UMAX: + Opc = X86ISD::ATOMUMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_UMIN: + Opc = X86ISD::ATOMUMIN64_DAG; + break; case ISD::ATOMIC_SWAP: Opc = X86ISD::ATOMSWAP64_DAG; break; @@ -11564,13 +12304,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; - case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; - case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; - case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; + case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; + case X86ISD::UMAX: return "X86ISD::UMAX"; + case X86ISD::UMIN: return "X86ISD::UMIN"; + case X86ISD::SMAX: return "X86ISD::SMAX"; + case X86ISD::SMIN: return "X86ISD::SMIN"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; @@ -11580,6 +12323,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; + case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; @@ -11595,7 +12340,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -11618,7 +12366,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::ANDN: return "X86ISD::ANDN"; case X86ISD::BLSI: return "X86ISD::BLSI"; case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; case X86ISD::BLSR: return "X86ISD::BLSR"; @@ -11662,6 +12409,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; + case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; } } @@ -11719,24 +12468,21 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; } - bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { - return Imm == (int32_t)Imm; + return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. - return Imm == (int32_t)Imm; + return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { @@ -11744,9 +12490,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { @@ -11759,6 +12503,30 @@ bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); } +bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) + return true; + + if (Val.getOpcode() != ISD::LOAD) + return false; + + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + // X86 has 8, 16, and 32-bit zero-extending loads. + return true; + } + + return false; +} + bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); @@ -11779,15 +12547,15 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return (VT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, VT) || - isSHUFPMask(M, VT, Subtarget->hasAVX()) || + isSHUFPMask(M, VT, Subtarget->hasFp256()) || isPSHUFDMask(M, VT) || - isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || - isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || + isPSHUFHWMask(M, VT, Subtarget->hasInt256()) || + isPSHUFLWMask(M, VT, Subtarget->hasInt256()) || isPALIGNRMask(M, VT, Subtarget) || - isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || - isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || - isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || - isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); + isUNPCKLMask(M, VT, Subtarget->hasInt256()) || + isUNPCKHMask(M, VT, Subtarget->hasInt256()) || + isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) || + isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256())); } bool @@ -11800,8 +12568,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, if (NumElts == 4 && VT.is128BitVector()) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || - isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); + isSHUFPMask(Mask, VT, Subtarget->hasFp256()) || + isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true)); } return false; } @@ -11810,430 +12578,724 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, // X86 Scheduler Hooks //===----------------------------------------------------------------------===// -// private utility function -MachineBasicBlock * -X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpc, - unsigned immOpc, - unsigned LoadOpc, - unsigned CXchgOpc, - unsigned notOpc, - unsigned EAXreg, - const TargetRegisterClass *RC, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [bitinstr.addr] - // op t2 = t1, [bitinstr.val] - // not t3 = t2 (if Invert) - // mov EAX = t1 - // lcs dest = [bitinstr.addr], t3 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // For the v = xbegin(), we generate + // + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - // Insert instructions into newMBB based on incoming instruction - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - DebugLoc dl = bInstr->getDebugLoc(); - MachineOperand& destOper = bInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = bInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &bInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - if (Invert) { - MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); - } - else - t3 = t2; + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); + + // mainMBB: + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); - MIB.addReg(t1); + MI->eraseFromParent(); + return sinkMBB; +} - MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); +// Get CMPXCHG opcode for the specified data type. +static unsigned getCmpXChgOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::LCMPXCHG8; + case MVT::i16: return X86::LCMPXCHG16; + case MVT::i32: return X86::LCMPXCHG32; + case MVT::i64: return X86::LCMPXCHG64; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(EAXreg); +// Get LOAD opcode for the specified data type. +static unsigned getLoadOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::MOV8rm; + case MVT::i16: return X86::MOV16rm; + case MVT::i32: return X86::MOV32rm; + case MVT::i64: return X86::MOV64rm; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); +// Get opcode of the non-atomic one from the specified atomic instruction. +static unsigned getNonAtomicOpcode(unsigned Opc) { + switch (Opc) { + case X86::ATOMAND8: return X86::AND8rr; + case X86::ATOMAND16: return X86::AND16rr; + case X86::ATOMAND32: return X86::AND32rr; + case X86::ATOMAND64: return X86::AND64rr; + case X86::ATOMOR8: return X86::OR8rr; + case X86::ATOMOR16: return X86::OR16rr; + case X86::ATOMOR32: return X86::OR32rr; + case X86::ATOMOR64: return X86::OR64rr; + case X86::ATOMXOR8: return X86::XOR8rr; + case X86::ATOMXOR16: return X86::XOR16rr; + case X86::ATOMXOR32: return X86::XOR32rr; + case X86::ATOMXOR64: return X86::XOR64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction with +// extra opcode. +static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; + case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; + case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; + case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; + case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; + case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; + case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; + case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; + case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; + case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; + case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; + case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; + case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; + case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; + case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; + case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; + case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; + case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; + case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; + case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target. +static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { + switch (Opc) { + case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; + case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; + case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; + case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; + case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; + case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; + case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; + case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; + case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; + case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target with extra opcode. +static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, + unsigned &HiOpc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND6432: + ExtraOpc = X86::NOT32r; + HiOpc = X86::AND32rr; + return X86::AND32rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; +// Get pseudo CMOV opcode from the specified data type. +static unsigned getPseudoCMOVOpc(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::CMOV_GR8; + case MVT::i16: return X86::CMOV_GR16; + case MVT::i32: return X86::CMOV_GR32; + default: + break; + } + llvm_unreachable("Unknown CMOV opcode!"); } -// private utility function: 64 bit atomics on 32 bit host. +// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. +// They will be translated into a spin-loop or compare-exchange loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD MI.addr +// loop: +// t1 = OP MI.val, EAX +// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] +// JNE loop +// sink: +// dst = EAX +// ... MachineBasicBlock * -X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpcL, - unsigned regOpcH, - unsigned immOpcL, - unsigned immOpcH, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB (instructions are in pairs, except cmpxchg8b) - // ld t1,t2 = [bitinstr.addr] - // newMBB: - // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) - // op t5, t6 <- out1, out2, [bitinstr.val] - // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) - // neg t7, t8 < t5, t6 (if Invert) - // mov ECX, EBX <- t5, t6 - // mov EAX, EDX <- t1, t2 - // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] - // mov t3, t4 <- EAX, EDX - // bz newMBB - // result in out1, out2 - // fallthrough -->nextMBB - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const unsigned LoadOpc = X86::MOV32rm; - const unsigned NotOpc = X86::NOT32r; +X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg, SrcReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + MVT::SimpleValueType VT = *RC->vt_begin(); + unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); + + unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); + unsigned LOADOpc = getLoadOpcode(VT); + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // EAX = LOAD [MI.addr] + // mainMBB: + // t1 = OP MI.val, EAX + // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = bInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - // There are 8 "real" operands plus 9 implicit def/uses, ignored here. - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && - "unexpected number of operands"); - MachineOperand& dest1Oper = bInstr->getOperand(0); - MachineOperand& dest2Oper = bInstr->getOperand(1); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { - argOpers[i] = &bInstr->getOperand(i+2); - - // We use some of the operands multiple times, so conservatively just - // clear any kill flags that might be present. - if (argOpers[i]->isReg() && argOpers[i]->isUse()) - argOpers[i]->setIsKill(false); - } - - // x86 address has 5 operands: base, index, scale, displacement, and segment. - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); - // add 4 to displacement. - for (int i=0; i <= lastAddrIndx-2; ++i) - (*MIB).addOperand(*argOpers[i]); - MachineOperand newOp3 = *(argOpers[3]); - if (newOp3.isImm()) - newOp3.setImm(newOp3.getImm()+4); - else - newOp3.setOffset(newOp3.getOffset()+4); - (*MIB).addOperand(newOp3); - (*MIB).addOperand(*argOpers[lastAddrIndx]); - - // t3/4 are defined later, at the bottom of the loop - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - unsigned t4 = F->getRegInfo().createVirtualRegister(RC); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) - .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); - - // The subsequent operations should be using the destination registers of - // the PHI instructions. - t1 = dest1Oper.getReg(); - t2 = dest2Oper.getReg(); - - int valArgIndx = lastAddrIndx + 1; - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - unsigned t5 = F->getRegInfo().createVirtualRegister(RC); - unsigned t6 = F->getRegInfo().createVirtualRegister(RC); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); - if (regOpcL != X86::MOV32rr) - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); - assert(argOpers[valArgIndx + 1]->isReg() == - argOpers[valArgIndx]->isReg()); - assert(argOpers[valArgIndx + 1]->isImm() == - argOpers[valArgIndx]->isImm()); - if (argOpers[valArgIndx + 1]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); - if (regOpcH != X86::MOV32rr) - MIB.addReg(t2); - (*MIB).addOperand(*argOpers[valArgIndx + 1]); - - unsigned t7, t8; - if (Invert) { - t7 = F->getRegInfo().createVirtualRegister(RC); - t8 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); - } else { - t7 = t5; - t8 = t6; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + + thisMBB->addSuccessor(mainMBB); + + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(AccPhyReg); + + // Copy AccPhyReg as it is used more than once. + unsigned AccReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) + .addReg(AccPhyReg); + + unsigned t1 = MRI.createVirtualRegister(RC); + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op opcode!"); + case X86::ATOMAND8: + case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + case X86::ATOMOR8: + case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + case X86::ATOMXOR8: + case X86::ATOMXOR16: + case X86::ATOMXOR32: + case X86::ATOMXOR64: { + unsigned ARITHOpc = getNonAtomicOpcode(Opc); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) + .addReg(AccReg); + break; + } + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: { + unsigned t2 = MRI.createVirtualRegister(RC); + unsigned NOTOpc; + unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); + BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) + .addReg(AccReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); + break; + } + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: { + unsigned CMPOpc; + unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); + + BuildMI(mainMBB, DL, TII->get(CMPOpc)) + .addReg(SrcReg) + .addReg(AccReg); + + if (Subtarget->hasCMov()) { + if (VT != MVT::i8) { + // Native support + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) + .addReg(SrcReg) + .addReg(AccReg); + } else { + // Promote i8 to i32 to use CMOV32 + const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); + unsigned SrcReg32 = MRI.createVirtualRegister(RC32); + unsigned AccReg32 = MRI.createVirtualRegister(RC32); + unsigned t2 = MRI.createVirtualRegister(RC32); + + unsigned Undef = MRI.createVirtualRegister(RC32); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) + .addReg(Undef) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) + .addReg(Undef) + .addReg(AccReg) + .addImm(X86::sub_8bit); + + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + .addReg(SrcReg32) + .addReg(AccReg32); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) + .addReg(t2, 0, X86::sub_8bit); + } + } else { + // Use pseudo select and lower them. + assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && + "Invalid atomic-load-op transformation!"); + unsigned SelOpc = getPseudoCMOVOpc(VT); + X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); + assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) + .addReg(SrcReg).addReg(AccReg) + .addImm(CC); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } } - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); - MIB.addReg(t2); + // Copy AccPhyReg back from virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) + .addReg(AccReg); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); - MIB.addReg(t7); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); - MIB.addReg(t8); + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.addReg(t1); + MIB.setMemRefs(MMOBegin, MMOEnd); - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); - MIB.addReg(X86::EAX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); - MIB.addReg(X86::EDX); + // sinkMBB: + sinkMBB->addLiveIn(AccPhyReg); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstReg) + .addReg(AccPhyReg); - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + MI->eraseFromParent(); + return sinkMBB; } -// private utility function +// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic +// instructions. They will be translated into a spin-loop or compare-exchange +// loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD [MI.addr + 0] +// EDX = LOAD [MI.addr + 4] +// loop: +// EBX = OP MI.val.lo, EAX +// ECX = OP MI.val.hi, EDX +// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// JNE loop +// sink: +// dst = EDX:EAX +// ... MachineBasicBlock * -X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, - MachineBasicBlock *MBB, - unsigned cmovOpc) const { - // For the atomic min/max operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [min/max.addr] - // mov t2 = [min/max.val] - // cmp t1, t2 - // cmov[cond] t2 = t1 - // mov EAX = t1 - // lcs dest = [bitinstr.addr], t2 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB - // +X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op32 to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstLoReg, DstHiReg; + unsigned SrcLoReg, SrcHiReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstLoReg = MI->getOperand(CurOp++).getReg(); + DstHiReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcLoReg = MI->getOperand(CurOp++).getReg(); + SrcHiReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = &X86::GR32RegClass; + const TargetRegisterClass *RC8 = &X86::GR8RegClass; + + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; + unsigned LOADOpc = X86::MOV32rm; + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // EAX = LOAD [MI.addr + 0] + // EDX = LOAD [MI.addr + 4] + // mainMBB: + // EBX = OP MI.vallo, EAX + // ECX = OP MI.valhi, EDX + // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(mInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to newMBB and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = mInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - MachineOperand& destOper = mInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = mInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &mInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - // We only support register and immediate values - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - - unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); + MachineInstrBuilder MIB; - MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); - MIB.addReg(t1); - MIB.addReg(t2); + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // Lo + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Hi + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); - // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); - MIB.addReg(t2); - MIB.addReg(t1); + thisMBB->addSuccessor(mainMBB); - // Cmp and exchange if none has modified the memory location - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(mInstr->memoperands_begin(), - mInstr->memoperands_end()); + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(X86::EAX); + mainMBB->addLiveIn(X86::EDX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(X86::EAX); + // Copy EDX:EAX as they are used more than once. + unsigned LoReg = MRI.createVirtualRegister(RC); + unsigned HiReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); - mInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); + case X86::ATOMAND6432: + case X86::ATOMOR6432: + case X86::ATOMXOR6432: + case X86::ATOMADD6432: + case X86::ATOMSUB6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg); + break; + } + case X86::ATOMNAND6432: { + unsigned HiOpc, NOTOpc; + unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); + break; + } + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + unsigned cL = MRI.createVirtualRegister(RC8); + unsigned cH = MRI.createVirtualRegister(RC8); + unsigned cL32 = MRI.createVirtualRegister(RC); + unsigned cH32 = MRI.createVirtualRegister(RC); + unsigned cc = MRI.createVirtualRegister(RC); + // cl := cmp src_lo, lo + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), cL); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); + // ch := cmp src_hi, hi + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), cH); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); + // cc := if (src_hi == hi) ? cl : ch; + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) + .addReg(cH32).addReg(cL32); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) + .addReg(cH32).addReg(cL32) + .addImm(X86::COND_E); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) + .addReg(SrcHiReg).addReg(HiReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) + .addReg(SrcLoReg).addReg(LoReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) + .addReg(SrcHiReg).addReg(HiReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } + case X86::ATOMSWAP6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); + break; + } + } + + // Copy EDX:EAX back from HiReg:LoReg + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); + // Copy ECX:EBX from t1H:t1L + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); + + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); + + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + sinkMBB->addLiveIn(X86::EAX); + sinkMBB->addLiveIn(X86::EDX); + + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstLoReg) + .addReg(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstHiReg) + .addReg(X86::EDX); + + MI->eraseFromParent(); + return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. -MachineBasicBlock * -X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, - unsigned numArgs, bool memArg) const { - assert(Subtarget->hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); +static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; + case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; + case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; + case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; + case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; + case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; + case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; + case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; + } DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +// FIXME: Custom handling because TableGen doesn't support multiple implicit +// defs in an instruction pattern +static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { unsigned Opc; - if (!Subtarget->hasAVX()) { - if (memArg) - Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; - else - Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; - } else { - if (memArg) - Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; - else - Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; + case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; + case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; + case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; + case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; + case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; + case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; + case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } + DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - for (unsigned i = 0; i < numArgs; ++i) { - MachineOperand &Op = MI->getOperand(i+1); + + unsigned NumArgs = MI->getNumOperands(); // remove the results + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::XMM0); + .addReg(X86::ECX); MI->eraseFromParent(); return BB; } -MachineBasicBlock * -X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { +static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const X86Subtarget* Subtarget) { DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; @@ -12560,7 +13622,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MBB->addSuccessor(EndMBB); } - unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; @@ -12922,6 +13984,203 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, } MachineBasicBlock * +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg; + unsigned MemOpndSlot = 0; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MemOpndSlot = CurOp; + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = getTargetMachine().getRelocationModel(); + bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + // Store IP + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + .addMBB(restoreMBB); + MIB.addRegMask(RegInfo->getNoPreservedMask()); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: + // EAX = 0 + BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(restoreMBB); + + // restoreMBB: + BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + unsigned SP = RegInfo->getStackRegister(); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + + unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; + unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; + + // Reload FP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload IP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), LabelOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload SP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), SPOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Jump + BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { @@ -13050,156 +14309,73 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: { - unsigned NumArgs; - bool MemArg; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRM128REG: - case X86::VPCMPISTRM128REG: - NumArgs = 3; MemArg = false; break; - case X86::PCMPISTRM128MEM: - case X86::VPCMPISTRM128MEM: - NumArgs = 3; MemArg = true; break; - case X86::PCMPESTRM128REG: - case X86::VPCMPESTRM128REG: - NumArgs = 5; MemArg = false; break; - case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - NumArgs = 5; MemArg = true; break; - } - return EmitPCMP(MI, BB, NumArgs, MemArg); - } - - // Thread synchronization. + case X86::VPCMPESTRM128MEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + + // String/text processing lowering. + case X86::PCMPISTRIREG: + case X86::VPCMPISTRIREG: + case X86::PCMPISTRIMEM: + case X86::VPCMPISTRIMEM: + case X86::PCMPESTRIREG: + case X86::VPCMPESTRIREG: + case X86::PCMPESTRIMEM: + case X86::VPCMPESTRIMEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + + // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB); + return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); - // Atomic Lowering. - case X86::ATOMMIN32: - case X86::ATOMMAX32: - case X86::ATOMUMIN32: - case X86::ATOMUMAX32: - case X86::ATOMMIN16: - case X86::ATOMMAX16: - case X86::ATOMUMIN16: - case X86::ATOMUMAX16: - case X86::ATOMMIN64: - case X86::ATOMMAX64: - case X86::ATOMUMIN64: - case X86::ATOMUMAX64: { - unsigned Opc; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMMIN32: Opc = X86::CMOVL32rr; break; - case X86::ATOMMAX32: Opc = X86::CMOVG32rr; break; - case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break; - case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break; - case X86::ATOMMIN16: Opc = X86::CMOVL16rr; break; - case X86::ATOMMAX16: Opc = X86::CMOVG16rr; break; - case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break; - case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break; - case X86::ATOMMIN64: Opc = X86::CMOVL64rr; break; - case X86::ATOMMAX64: Opc = X86::CMOVG64rr; break; - case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break; - case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break; - // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. - } - return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc); - } - - case X86::ATOMAND32: - case X86::ATOMOR32: - case X86::ATOMXOR32: - case X86::ATOMNAND32: { - bool Invert = false; - unsigned RegOpc, ImmOpc; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMAND32: - RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break; - case X86::ATOMOR32: - RegOpc = X86::OR32rr; ImmOpc = X86::OR32ri; break; - case X86::ATOMXOR32: - RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break; - case X86::ATOMNAND32: - RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break; - } - return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, - X86::MOV32rm, X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass, Invert); - } + // xbegin + case X86::XBEGIN: + return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); + // Atomic Lowering. + case X86::ATOMAND8: case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + // Fall through + case X86::ATOMOR8: case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + // Fall through case X86::ATOMXOR16: - case X86::ATOMNAND16: { - bool Invert = false; - unsigned RegOpc, ImmOpc; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMAND16: - RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break; - case X86::ATOMOR16: - RegOpc = X86::OR16rr; ImmOpc = X86::OR16ri; break; - case X86::ATOMXOR16: - RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break; - case X86::ATOMNAND16: - RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break; - } - return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, - X86::MOV16rm, X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass, Invert); - } - - case X86::ATOMAND8: - case X86::ATOMOR8: case X86::ATOMXOR8: - case X86::ATOMNAND8: { - bool Invert = false; - unsigned RegOpc, ImmOpc; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMAND8: - RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break; - case X86::ATOMOR8: - RegOpc = X86::OR8rr; ImmOpc = X86::OR8ri; break; - case X86::ATOMXOR8: - RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break; - case X86::ATOMNAND8: - RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break; - } - return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, - X86::MOV8rm, X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass, Invert); - } - - // This group is for 64-bit host. - case X86::ATOMAND64: - case X86::ATOMOR64: + case X86::ATOMXOR32: case X86::ATOMXOR64: - case X86::ATOMNAND64: { - bool Invert = false; - unsigned RegOpc, ImmOpc; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMAND64: - RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break; - case X86::ATOMOR64: - RegOpc = X86::OR64rr; ImmOpc = X86::OR64ri32; break; - case X86::ATOMXOR64: - RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break; - case X86::ATOMNAND64: - RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break; - } - return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, - X86::MOV64rm, X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass, Invert); - } + // Fall through + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: + // Fall through + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + // Fall through + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + // Fall through + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + // Fall through + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: + return EmitAtomicLoadArith(MI, BB); // This group does 64-bit operations on a 32-bit host. case X86::ATOMAND6432: @@ -13208,50 +14384,26 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::ATOMNAND6432: case X86::ATOMADD6432: case X86::ATOMSUB6432: - case X86::ATOMSWAP6432: { - bool Invert = false; - unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::ATOMAND6432: - RegOpcL = RegOpcH = X86::AND32rr; - ImmOpcL = ImmOpcH = X86::AND32ri; - break; - case X86::ATOMOR6432: - RegOpcL = RegOpcH = X86::OR32rr; - ImmOpcL = ImmOpcH = X86::OR32ri; - break; - case X86::ATOMXOR6432: - RegOpcL = RegOpcH = X86::XOR32rr; - ImmOpcL = ImmOpcH = X86::XOR32ri; - break; - case X86::ATOMNAND6432: - RegOpcL = RegOpcH = X86::AND32rr; - ImmOpcL = ImmOpcH = X86::AND32ri; - Invert = true; - break; - case X86::ATOMADD6432: - RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr; - ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri; - break; - case X86::ATOMSUB6432: - RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr; - ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri; - break; - case X86::ATOMSWAP6432: - RegOpcL = RegOpcH = X86::MOV32rr; - ImmOpcL = ImmOpcH = X86::MOV32ri; - break; - } - return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH, - ImmOpcL, ImmOpcH, Invert); - } + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: + case X86::ATOMSWAP6432: + return EmitAtomicLoadArith6432(MI, BB); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); + + case X86::EH_SjLj_SetJmp32: + case X86::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case X86::EH_SjLj_LongJmp32: + case X86::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); } } @@ -13434,6 +14586,18 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, Ld->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as Ld in + // terms of dependency. We create a TokenFactor for Ld and ResNode, + // and update uses of Ld's output chain to use the TokenFactor. + if (Ld->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); } } @@ -13479,7 +14643,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasAVX() && VT.is256BitVector() && + if (Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -13497,127 +14661,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } - -/// DCI, PerformTruncateCombine - Converts truncate operation to +/// PerformTruncateCombine - Converts truncate operation to /// a sequence of vector shuffle operations. /// It is possible when we truncate 256-bit vector to 128-bit vector - -SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, - DAGCombinerInfo &DCI) const { - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (!Subtarget->hasAVX()) - return SDValue(); - - EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); - EVT OpVT = Op.getValueType(); - DebugLoc dl = N->getDebugLoc(); - - if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { - - if (Subtarget->hasAVX2()) { - // AVX2: v4i64 -> v4i32 - - // VPERMD - static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; - - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); - Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), - ShufMask); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, - DAG.getIntPtrConstant(0)); - } - - // AVX: v4i64 -> v4i32 - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(0)); - - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(2)); - - OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); - - // PSHUFD - static const int ShufMask1[] = {0, 2, 0, 0}; - - SDValue Undef = DAG.getUNDEF(VT); - OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1); - - // MOVLHPS - static const int ShufMask2[] = {0, 1, 4, 5}; - - return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); - } - - if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { - - if (Subtarget->hasAVX2()) { - // AVX2: v8i32 -> v8i16 - - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); - - // PSHUFB - SmallVector<SDValue,32> pshufbMask; - for (unsigned i = 0; i < 2; ++i) { - pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); - for (unsigned j = 0; j < 8; ++j) - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, - &pshufbMask[0], 32); - Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); - - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); - - static const int ShufMask[] = {0, 2, -1, -1}; - Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64), - &ShufMask[0]); - - Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(0)); - - return DAG.getNode(ISD::BITCAST, dl, VT, Op); - } - - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(0)); - - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(4)); - - OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); - - // PSHUFB - static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1}; - - SDValue Undef = DAG.getUNDEF(MVT::v16i8); - OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1); - - OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); - - // MOVLHPS - static const int ShufMask2[] = {0, 1, 4, 5}; - - SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); - } - +static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { return SDValue(); } @@ -13695,7 +14744,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getTargetData()-> + unsigned NewAlign = TLI.getDataLayout()-> getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) @@ -13726,6 +14775,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return NewOp; SDValue InputVector = N->getOperand(0); + // Detect whether we are trying to convert from mmx to i32 and the bitcast + // from mmx to v2i32 has a single usage. + if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && + InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && + InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. @@ -13804,6 +14861,76 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. +static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (!VT.isVector()) + return 0; + + switch (VT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + if (!Subtarget->hasAVX2()) + return 0; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + if (!Subtarget->hasSSE2()) + return 0; + } + + // SSE2 has only a small subset of the operations. + bool hasUnsigned = Subtarget->hasSSE41() || + (Subtarget->hasSSE2() && VT == MVT::v16i8); + bool hasSigned = Subtarget->hasSSE41() || + (Subtarget->hasSSE2() && VT == MVT::v8i16); + + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Check for x CC y ? x : y. + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + switch (CC) { + default: break; + case ISD::SETULT: + case ISD::SETULE: + return hasUnsigned ? X86ISD::UMIN : 0; + case ISD::SETUGT: + case ISD::SETUGE: + return hasUnsigned ? X86ISD::UMAX : 0; + case ISD::SETLT: + case ISD::SETLE: + return hasSigned ? X86ISD::SMIN : 0; + case ISD::SETGT: + case ISD::SETGE: + return hasSigned ? X86ISD::SMAX : 0; + } + // Check for x CC y ? y : x -- a min/max with reversed arms. + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { + switch (CC) { + default: break; + case ISD::SETULT: + case ISD::SETULE: + return hasUnsigned ? X86ISD::UMAX : 0; + case ISD::SETUGT: + case ISD::SETUGE: + return hasUnsigned ? X86ISD::UMIN : 0; + case ISD::SETLT: + case ISD::SETLE: + return hasSigned ? X86ISD::SMAX : 0; + case ISD::SETGT: + case ISD::SETGE: + return hasSigned ? X86ISD::SMIN : 0; + } + } + + return 0; +} + /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, @@ -14084,6 +15211,71 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Match VSELECTs into subs with unsigned saturation. + if (!DCI.isBeforeLegalize() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. + ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Check if one of the arms of the VSELECT is a zero vector. If it's on the + // left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { + Other = LHS; + } + + if (Other.getNode() && Other->getNumOperands() == 2 && + DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { + SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); + SDValue CondRHS = Cond->getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? x-y : 0 --> subus x, y + // x > y ? x-y : 0 --> subus x, y + if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && + Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + + // If the RHS is a constant we have to reverse the const canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { + APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); + if (CondRHS.getConstantOperandVal(0) == -A-1) { + SmallVector<SDValue, 32> V(VT.getVectorNumElements(), + DAG.getConstant(-A, VT.getScalarType())); + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, + V.data(), V.size())); + } + } + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + isSplatVector(OpRHS.getNode())) { + APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); + if (A.isSignBit()) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + } + } + } + + // Try to match a min/max vector operation. + if (!DCI.isBeforeLegalize() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) + if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) + return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits @@ -14207,84 +15399,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); } -/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS -/// updated. If only flag result is used and the result is evaluated from a -/// series of element extraction, try to combine it into a PTEST. -static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC, - SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDNode *N = Or.getNode(); - DebugLoc DL = N->getDebugLoc(); - - // Only SSE4.1 and beyond supports PTEST or like. - if (!Subtarget->hasSSE41()) - return SDValue(); - - if (N->getOpcode() != X86ISD::OR) - return SDValue(); - - // Quit if the value result of OR is used. - if (N->hasAnyUseOfValue(0)) - return SDValue(); - - // Quit if not used as a boolean value. - if (CC != X86::COND_E && CC != X86::COND_NE) - return SDValue(); - - SmallVector<SDValue, 8> Opnds; - SDValue VecIn; - EVT VT = MVT::Other; - unsigned Mask = 0; - - // Recognize a special case where a vector is casted into wide integer to - // test all 0s. - Opnds.push_back(N->getOperand(0)); - Opnds.push_back(N->getOperand(1)); - - for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { - SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; - // BFS traverse all OR'd operands. - if (I->getOpcode() == ISD::OR) { - Opnds.push_back(I->getOperand(0)); - Opnds.push_back(I->getOperand(1)); - // Re-evaluate the number of nodes to be traversed. - e += 2; // 2 more nodes (LHS and RHS) are pushed. - continue; - } - - // Quit if a non-EXTRACT_VECTOR_ELT - if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Quit if without a constant index. - SDValue Idx = I->getOperand(1); - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); - - // Check if all elements are extracted from the same vector. - SDValue ExtractedFromVec = I->getOperand(0); - if (VecIn.getNode() == 0) { - VT = ExtractedFromVec.getValueType(); - // FIXME: only 128-bit vector is supported so far. - if (!VT.is128BitVector()) - return SDValue(); - VecIn = ExtractedFromVec; - } else if (VecIn != ExtractedFromVec) - return SDValue(); - - // Record the constant index. - Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); - } - - assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far."); - - // Quit if not all elements are used. - if (Mask != (1U << VT.getVectorNumElements()) - 1U) - return SDValue(); - - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn); -} - /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -14323,14 +15437,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, Ops, array_lengthof(Ops)); } - Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget); - if (Flags.getNode()) { - SDValue Ops[] = { FalseOp, TrueOp, - DAG.getConstant(CC, MVT::i8), Flags }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), - Ops, array_lengthof(Ops)); - } - // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. @@ -14341,6 +15447,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); + std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. @@ -14423,10 +15530,49 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } } + + // Handle these cases: + // (select (x != c), e, c) -> select (x != c), e, x), + // (select (x == c), c, e) -> select (x == c), x, e) + // where the c is an integer constant, and the "select" is the combination + // of CMOV and CMP. + // + // The rationale for this change is that the conditional-move from a constant + // needs two instructions, however, conditional-move from a register needs + // only one instruction. + // + // CAVEAT: By replacing a constant with a symbolic value, it may obscure + // some instruction-combining opportunities. This opt needs to be + // postponed as late as possible. + // + if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { + // the DCI.xxxx conditions are provided to postpone the optimization as + // late as possible. + + ConstantSDNode *CmpAgainst = 0; + if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && + (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && + dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) { + + if (CC == X86::COND_NE && + CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueOp, FalseOp); + } + + if (CC == X86::COND_E && + CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { + SDValue Ops[] = { FalseOp, Cond.getOperand(0), + DAG.getConstant(CC, MVT::i8), Cond }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, + array_lengthof(Ops)); + } + } + } + return SDValue(); } - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. @@ -14515,7 +15661,6 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { } } - // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. @@ -14553,7 +15698,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, return SDValue(); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && - (!Subtarget->hasAVX2() || + (!Subtarget->hasInt256() || (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); @@ -14659,7 +15804,6 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } } - // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) // where both setccs reference the same FP CMP, and rewrite for CMPEQSS // and friends. Likewise for OR -> CMPNEQSS. @@ -14768,9 +15912,92 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) { return false; } +// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized +// register. In most cases we actually compare or select YMM-sized registers +// and mixing the two types creates horrible code. This method optimizes +// some of the transition sequences. +static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.getSizeInBits() != 256) + return SDValue(); + + assert((N->getOpcode() == ISD::ANY_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); + + SDValue Narrow = N->getOperand(0); + EVT NarrowVT = Narrow->getValueType(0); + if (NarrowVT.getSizeInBits() != 128) + return SDValue(); + + if (Narrow->getOpcode() != ISD::XOR && + Narrow->getOpcode() != ISD::AND && + Narrow->getOpcode() != ISD::OR) + return SDValue(); + + SDValue N0 = Narrow->getOperand(0); + SDValue N1 = Narrow->getOperand(1); + DebugLoc DL = Narrow->getDebugLoc(); + + // The Left side has to be a trunc. + if (N0.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + // The type of the truncated inputs. + EVT WideVT = N0->getOperand(0)->getValueType(0); + if (WideVT != VT) + return SDValue(); + + // The right side has to be a 'trunc' or a constant vector. + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; + bool RHSConst = (isSplatVector(N1.getNode()) && + isa<ConstantSDNode>(N1->getOperand(0))); + if (!RHSTrunc && !RHSConst) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) + return SDValue(); + + // Set N0 and N1 to hold the inputs to the new wide operation. + N0 = N0->getOperand(0); + if (RHSConst) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1->getOperand(0)); + SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); + } else if (RHSTrunc) { + N1 = N1->getOperand(0); + } + + // Generate the wide operation. + SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + case ISD::ANY_EXTEND: + return Op; + case ISD::ZERO_EXTEND: { + unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(InBits); + Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + return DAG.getNode(ISD::AND, DL, VT, + Op, DAG.getConstant(Mask, VT)); + } + case ISD::SIGN_EXTEND: + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, + Op, DAG.getValueType(NarrowVT)); + default: + llvm_unreachable("Unexpected opcode"); + } +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -14778,9 +16005,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - EVT VT = N->getValueType(0); - - // Create ANDN, BLSI, and BLSR instructions + // Create BLSI, and BLSR instructions // BLSI is X & (-X) // BLSR is X & (X-1) if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { @@ -14788,13 +16013,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); DebugLoc DL = N->getDebugLoc(); - // Check LHS for not - if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1))) - return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1); - // Check RHS for not - if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) - return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); - // Check LHS for neg if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && isZero(N0.getOperand(0))) @@ -14847,6 +16065,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -14854,15 +16073,13 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend if (VT == MVT::v2i64 || VT == MVT::v4i64) { if (!Subtarget->hasSSSE3() || - (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + (VT == MVT::v4i64 && !Subtarget->hasInt256())) return SDValue(); // Canonicalize pandn to RHS @@ -14908,6 +16125,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, DebugLoc DL = N->getDebugLoc(); + // We are going to replace the AND, OR, NAND with either BLEND + // or PSIGN, which only look at the MSB. The VSRAI instruction + // does not affect the highest bit, so we can get rid of it. + Mask = Mask.getOperand(0); + // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); @@ -14916,7 +16138,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); - Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } // PBLENDVB only available on SSE 4.1 @@ -15030,6 +16252,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -15043,8 +16266,6 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasBMI()) return SDValue(); - EVT VT = N->getValueType(0); - if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); @@ -15079,11 +16300,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::LoadExtType Ext = Ld->getExtensionType(); // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. We need SSE4 for the shuffles. + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. + // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll + // emit a shuffle and a arithmetic shift. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && RegVT.isInteger() && - Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && + (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -15092,6 +16316,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) + return SDValue(); + // All sizes must be a power of two. if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); @@ -15115,16 +16342,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + if (Ext == ISD::SEXTLOAD && NumLoads > 1) + return SDValue(); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz == 256) + loadRegZize /= 2; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, - RegSz/SclrLoadTy.getSizeInBits()); + loadRegZize/SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - RegSz/MemVT.getScalarType().getSizeInBits()); + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize/MemVT.getScalarType().getSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -15165,6 +16399,41 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); unsigned SizeRatio = RegSz/MemSz; + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1 we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + return DCI.CombineTo(N, Sext, TF, true); + } + + // Otherwise we'll shuffle the small elements in the high bits of the + // larger type and perform an arithmetic shift. If the shift is not legal + // it's better to scalarize. + if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) + return SDValue(); + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio + SizeRatio-1] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + + // Build the arithmetic shift. + unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - + MemVT.getVectorElementType().getSizeInBits(); + SmallVector<SDValue, 8> C(NumElems, + DAG.getConstant(Amt, RegVT.getScalarType())); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size()); + Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV); + + return DCI.CombineTo(N, Shuff, TF, true); + } + // Redistribute the loaded elements into the different locations. SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) @@ -15198,7 +16467,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. - if (VT.is256BitVector() && !Subtarget->hasAVX2() && + if (VT.is256BitVector() && !Subtarget->hasInt256() && StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && StoredVal.getNumOperands() == 2) { SDValue Value0 = StoredVal.getOperand(0); @@ -15298,7 +16567,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Chains.size()); } - // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right @@ -15309,7 +16577,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || @@ -15545,7 +16814,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -15560,7 +16829,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal subs from subs of shuffles. if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -15603,7 +16872,6 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } - /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 @@ -15655,52 +16923,16 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) + if (!Subtarget->hasFp256()) return SDValue(); EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); - EVT OpVT = Op.getValueType(); - DebugLoc dl = N->getDebugLoc(); - - if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || - (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { - - if (Subtarget->hasAVX2()) - return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); - - // Optimize vectors in AVX mode - // Sign extend v8i16 to v8i32 and - // v4i32 to v4i64 - // - // Divide input vector into two parts - // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} - // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 - // concat the vectors to original VT - - unsigned NumElems = OpVT.getVectorNumElements(); - SDValue Undef = DAG.getUNDEF(OpVT); - - SmallVector<int,8> ShufMask1(NumElems, -1); - for (unsigned i = 0; i != NumElems/2; ++i) - ShufMask1[i] = i; - - SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); - - SmallVector<int,8> ShufMask2(NumElems, -1); - for (unsigned i = 0; i != NumElems/2; ++i) - ShufMask2[i] = i + NumElems/2; - - SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); - - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), - VT.getVectorNumElements()/2); - - OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + if (VT.isVector() && VT.getSizeInBits() == 256) { + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; } + return SDValue(); } @@ -15754,58 +16986,26 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = N->getDebugLoc(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - EVT OpVT = N0.getValueType(); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() != X86ISD::SETCC_CARRY) - return SDValue(); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) - return SDValue(); - return DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, - N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, VT)); + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!C || C->getZExtValue() != 1) + return SDValue(); + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, VT)); + } } - // Optimize vectors in AVX mode: - // - // v8i16 -> v8i32 - // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. - // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. - // Concat upper and lower parts. - // - // v4i32 -> v4i64 - // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. - // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. - // Concat upper and lower parts. - // - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (!Subtarget->hasAVX()) - return SDValue(); - - if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || - ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { - - if (Subtarget->hasAVX2()) - return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); - - SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); - SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); - SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); - - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements()/2); - - OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + if (VT.isVector() && VT.getSizeInBits() == 256) { + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; } return SDValue(); @@ -15837,6 +17037,16 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// as "sbb reg,reg", since it can be extended without zext and produces +// an all-ones bit which is more useful than 0/1 in some cases. +static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { + return DAG.getNode(ISD::AND, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, + DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), + DAG.getConstant(1, MVT::i8)); +} + // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -15845,14 +17055,29 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); + if (CC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), + EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return MaterializeSETB(DL, NewEFLAGS, DAG); + } + } + // Materialize "setb reg" as "sbb reg,reg", since it can be extended without // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. if (CC == X86::COND_B) - return DAG.getNode(ISD::AND, DL, MVT::i8, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(CC, MVT::i8), EFLAGS), - DAG.getConstant(1, MVT::i8)); + return MaterializeSETB(DL, EFLAGS, DAG); SDValue Flags; @@ -15862,12 +17087,6 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } - Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); - if (Flags.getNode()) { - SDValue Cond = DAG.getConstant(CC, MVT::i8); - return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); - } - return SDValue(); } @@ -15891,30 +17110,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, Flags); } - Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); - if (Flags.getNode()) { - SDValue Cond = DAG.getConstant(CC, MVT::i8); - return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, - Flags); - } - - return SDValue(); -} - -static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op0 = N->getOperand(0); - EVT InVT = Op0->getValueType(0); - - // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; - SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); - // Notice that we use SINT_TO_FP because we know that the high bits - // are zero and SINT_TO_FP is better supported by the hardware. - return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); - } - return SDValue(); } @@ -15949,20 +17144,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() - if (VT == MVT::v8i8 || VT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; - SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, I); - } - - return SDValue(); -} - // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -16037,7 +17218,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); @@ -16070,13 +17251,28 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. EVT VT = N->getValueType(0); if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } +/// performVZEXTCombine - Performs build vector combines +static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // (vzext (bitcast (vzext (x)) -> (vzext x) + SDValue In = N->getOperand(0); + while (In.getOpcode() == ISD::BITCAST) + In = In.getOperand(0); + + if (In.getOpcode() != X86ISD::VZEXT) + return SDValue(); + + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -16099,9 +17295,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); - case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); - case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -16114,10 +17308,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); + case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: case X86ISD::UNPCKH: @@ -16346,8 +17541,6 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } - - /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType @@ -16424,17 +17617,17 @@ TargetLowering::ConstraintWeight case 'f': case 't': case 'u': - if (type->isFloatingPointTy()) - weight = CW_SpecificReg; - break; + if (type->isFloatingPointTy()) + weight = CW_SpecificReg; + break; case 'y': - if (type->isX86_MMXTy() && Subtarget->hasMMX()) - weight = CW_SpecificReg; - break; + if (type->isX86_MMXTy() && Subtarget->hasMMX()) + weight = CW_SpecificReg; + break; case 'x': case 'Y': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || - ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) weight = CW_Register; break; case 'I': @@ -16545,7 +17738,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'K': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - if ((int8_t)C->getSExtValue() == C->getSExtValue()) { + if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 74f5167..16ce364 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -15,14 +15,14 @@ #ifndef X86ISELLOWERING_H #define X86ISELLOWERING_H -#include "X86Subtarget.h" -#include "X86RegisterInfo.h" #include "X86MachineFunctionInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetOptions.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" namespace llvm { namespace X86ISD { @@ -142,6 +142,10 @@ namespace llvm { /// mnemonic, so do I; blame Intel. MOVDQ2Q, + /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, @@ -171,13 +175,14 @@ namespace llvm { /// PSIGN - Copy integer sign. PSIGN, - /// BLENDV - Blend where the selector is an XMM. + /// BLENDV - Blend where the selector is a register. BLENDV, - /// BLENDxx - Blend where the selector is an immediate. - BLENDPW, - BLENDPS, - BLENDPD, + /// BLENDI - Blend where the selector is an immediate. + BLENDI, + + // SUBUS - Integer sub with unsigned saturation. + SUBUS, /// HADD - Integer horizontal add. HADD, @@ -191,6 +196,12 @@ namespace llvm { /// FHSUB - Floating point horizontal sub. FHSUB, + /// UMAX, UMIN - Unsigned integer max and min. + UMAX, UMIN, + + /// SMAX, SMIN - Signed integer max and min. + SMAX, SMIN, + /// FMAX, FMIN - Floating point max and min. /// FMAX, FMIN, @@ -217,6 +228,12 @@ namespace llvm { // EH_RETURN - Exception Handling helpers. EH_RETURN, + // EH_SJLJ_SETJMP - SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + /// TC_RETURN - Tail call return. /// operand #0 chain /// operand #1 callee (register or absolute) @@ -230,9 +247,18 @@ namespace llvm { // VSEXT_MOVL - Vector move low and sign extend. VSEXT_MOVL, + // VZEXT - Vector integer zero-extend. + VZEXT, + + // VSEXT - Vector integer signed-extend. + VSEXT, + // VFPEXT - Vector FP extend. VFPEXT, + // VFPROUND - Vector FP round. + VFPROUND, + // VSHL, VSRL - 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -252,8 +278,6 @@ namespace llvm { ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - ANDN, // ANDN - Bitwise AND NOT with FLAGS results. - BLSI, // BLSI - Extract lowest set isolated bit BLSMSK, // BLSMSK - Get mask up to lowest set bit BLSR, // BLSR - Reset lowest set bit @@ -348,6 +372,10 @@ namespace llvm { ATOMXOR64_DAG, ATOMAND64_DAG, ATOMNAND64_DAG, + ATOMMAX64_DAG, + ATOMMIN64_DAG, + ATOMUMAX64_DAG, + ATOMUMIN64_DAG, ATOMSWAP64_DAG, // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. @@ -461,10 +489,6 @@ namespace llvm { getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const; - /// getStackPtrReg - Return the stack pointer register we are using: either - /// ESP or RSP. - unsigned getStackPtrReg() const { return X86StackPtr; } - /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contains are placed at 16-byte boundaries while the rest are at @@ -476,23 +500,29 @@ namespace llvm { /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, - /// probably because the source does not need to be loaded. If - /// 'IsZeroVal' is true, that means it's safe to return a - /// non-scalar-integer type, e.g. empty string source, constant, or loaded - /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is - /// constant so it does not need to be loaded. + /// probably because the source does not need to be loaded. If 'IsMemset' is + /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that + /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy + /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. virtual EVT - getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, bool MemcpyStrSrc, + getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; + /// isSafeMemOpType - Returns true if it's safe to use load / store of the + /// specified type to expand memcpy / memset inline. This is mostly true + /// for all types except for some special cases. For example, on X86 + /// targets without SSE2 f64 load / store are done with fldl / fstpl which + /// also does type conversion. Note the specified type doesn't have to be + /// legal as the hook is used before type legalization. + virtual bool isSafeMemOpType(MVT VT) const; + /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - virtual bool allowsUnalignedMemoryAccesses(EVT VT) const { - return true; - } + /// unaligned memory accesses. of the specified type. Returns whether it + /// is "fast" by reference in the second argument. + virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const; /// LowerOperation - Provide custom lowering hooks for some operations. /// @@ -610,6 +640,7 @@ namespace llvm { /// result out to 64 bits. virtual bool isZExtFree(Type *Ty1, Type *Ty2) const; virtual bool isZExtFree(EVT VT1, EVT VT2) const; + virtual bool isZExtFree(SDValue Val, EVT VT2) const; /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to @@ -690,17 +721,14 @@ namespace llvm { protected: std::pair<const TargetRegisterClass*, uint8_t> - findRepresentativeClass(EVT VT) const; + findRepresentativeClass(MVT VT) const; private: /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; const X86RegisterInfo *RegInfo; - const TargetData *TD; - - /// X86StackPtr - X86 physical register used as stack ptr. - unsigned X86StackPtr; + const DataLayout *TD; /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. @@ -744,6 +772,7 @@ namespace llvm { bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, + Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -763,15 +792,11 @@ namespace llvm { SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, @@ -785,12 +810,17 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -802,39 +832,26 @@ namespace llvm { SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const; - // Utility functions to help LowerVECTOR_SHUFFLE - SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const; + // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR + SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; + SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const; + SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -857,9 +874,8 @@ namespace llvm { virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; - virtual EVT - getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, - ISD::NodeType ExtendKind) const; + virtual MVT + getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const; virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -867,51 +883,17 @@ namespace llvm { const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const; - /// Utility function to emit string processing sse4.2 instructions - /// that return in xmm0. - /// This takes the instruction to expand, the associated machine basic - /// block, the number of args, and whether or not the second arg is - /// in memory or not. - MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB, - unsigned argNum, bool inMem) const; - - /// Utility functions to emit monitor and mwait instructions. These - /// need to make sure that the arguments to the intrinsic are in the - /// correct registers. - MachineBasicBlock *EmitMonitor(MachineInstr *MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const; - - /// Utility function to emit atomic bitwise operations (and, or, xor). - /// It takes the bitwise instruction to expand, the associated machine basic - /// block, and the associated X86 opcodes for reg/reg and reg/imm. - MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter( - MachineInstr *BInstr, - MachineBasicBlock *BB, - unsigned regOpc, - unsigned immOpc, - unsigned loadOpc, - unsigned cxchgOpc, - unsigned notOpc, - unsigned EAXreg, - const TargetRegisterClass *RC, - bool Invert = false) const; - - MachineBasicBlock *EmitAtomicBit6432WithCustomInserter( - MachineInstr *BInstr, - MachineBasicBlock *BB, - unsigned regOpcL, - unsigned regOpcH, - unsigned immOpcL, - unsigned immOpcH, - bool Invert = false) const; - - /// Utility function to emit atomic min and max. It takes the min/max - /// instruction to expand, the associated basic block, and the associated - /// cmov opcode for moving the min or max value. - MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr, - MachineBasicBlock *BB, - unsigned cmovOpc) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, + /// nand, max, min, umax, umin). It takes the corresponding instruction to + /// expand, the associated machine basic block, and the associated X86 + /// opcodes for reg/reg. + MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Utility function to emit atomic-load-arith operations (and, or, xor, + /// nand, add, sub, swap) for 64-bit operands on 32-bit target. + MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI, + MachineBasicBlock *MBB) const; // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock *EmitVAARG64WithCustomInserter( @@ -939,6 +921,12 @@ namespace llvm { MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f790611..0eecd5f 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -57,7 +57,7 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), - "mul{w}\t$src", + "mul{w}\t$src", [], IIC_MUL16_REG>, OpSize; // AX,DX = AX*GR16 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in @@ -158,7 +158,7 @@ def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), (X86smul_flag GR16:$src1, (load addr:$src2)))], IIC_IMUL16_RM>, TB, OpSize; -def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), +def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, EFLAGS, @@ -182,8 +182,8 @@ let Defs = [EFLAGS] in { def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, imm:$src2))], + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))], IIC_IMUL16_RRI>, OpSize; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), @@ -266,6 +266,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 // unsigned division/remainder +let hasSideEffects = 1 in { // so that we don't speculatively execute let Defs = [AL,EFLAGS,AX], Uses = [AX] in def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "div{b}\t$src", [], IIC_DIV8_REG>; @@ -319,12 +320,13 @@ let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX "idiv{w}\t$src", [], IIC_IDIV16>, OpSize; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX -def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), +def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), "idiv{l}\t$src", [], IIC_IDIV32>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), "idiv{q}\t$src", [], IIC_IDIV64>; } +} // hasSideEffects = 0 //===----------------------------------------------------------------------===// // Two address Instructions. @@ -412,11 +414,11 @@ def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), IIC_UNARY_REG>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. -def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>, OpSize, Requires<[In32BitMode]>; -def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), +def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], IIC_UNARY_REG>, @@ -430,22 +432,22 @@ def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", // In 64-bit mode, single byte INC and DEC cannot be encoded. let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can transform into LEA. -def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), +def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>, OpSize, Requires<[In64BitMode]>; -def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), +def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], IIC_UNARY_REG>, Requires<[In64BitMode]>; -def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), +def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], IIC_UNARY_REG>, OpSize, Requires<[In64BitMode]>; -def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), +def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], IIC_UNARY_REG>, @@ -469,7 +471,7 @@ let CodeSize = 2 in { def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; - + // These are duplicates of their 32-bit counterparts. Only needed so X86 knows // how to unfold them. // FIXME: What is this for?? @@ -498,12 +500,12 @@ def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], IIC_UNARY_REG>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. -def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], IIC_UNARY_REG>, OpSize, Requires<[In32BitMode]>; -def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), +def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], IIC_UNARY_REG>, @@ -544,57 +546,57 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> { /// VT - This is the value type itself. ValueType VT = vt; - + /// InstrSuffix - This is the suffix used on instructions with this type. For /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q". string InstrSuffix = instrsuffix; - + /// RegClass - This is the register class associated with this type. For /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64. RegisterClass RegClass = regclass; - + /// LoadNode - This is the load node associated with this type. For /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64. PatFrag LoadNode = loadnode; - + /// MemOperand - This is the memory operand associated with this type. For /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem. X86MemOperand MemOperand = memoperand; - + /// ImmEncoding - This is the encoding of an immediate of this type. For /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32 /// since the immediate fields of i64 instructions is a 32-bit sign extended /// value. ImmType ImmEncoding = immkind; - + /// ImmOperand - This is the operand kind of an immediate of this type. For /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 -> /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign /// extended value. Operand ImmOperand = immoperand; - + /// ImmOperator - This is the operator that should be used to match an /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32). SDPatternOperator ImmOperator = immoperator; - + /// Imm8Operand - This is the operand kind to use for an imm8 of this type. /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is /// only used for instructions that have a sign-extended imm8 field form. Operand Imm8Operand = imm8operand; - + /// Imm8Operator - This is the operator that should be used to match an 8-bit /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8). SDPatternOperator Imm8Operator = imm8operator; - + /// HasOddOpcode - This bit is true if the instruction should have an odd (as /// opposed to even) opcode. Operations on i8 are usually even, operations on /// other datatypes are odd. bit HasOddOpcode = hasOddOpcode; - + /// HasOpSizePrefix - This bit is set to true if the instruction should have /// the 0x66 operand size prefix. This is set for i16 types. bit HasOpSizePrefix = hasOpSizePrefix; - + /// HasREX_WPrefix - This bit is set to true if the instruction should have /// the 0x40 REX prefix. This is set for i64 types. bit HasREX_WPrefix = hasREX_WPrefix; @@ -624,12 +626,12 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, /// 3. Infers whether the instruction should have a 0x40 REX_W prefix. /// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) /// or 1 (for i16,i32,i64 operations). -class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, +class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, string mnemonic, string args, list<dag> pattern, InstrItinClass itin = IIC_BIN_NONMEM> : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, - f, outs, ins, + f, outs, ins, !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern, itin> { @@ -690,6 +692,7 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM> { // The disassembler should know about this, but not the asmparser. let isCodeGenOnly = 1; + let hasSideEffects = 0; } // BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). @@ -699,6 +702,7 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM> { // The disassembler should know about this, but not the asmparser. let isCodeGenOnly = 1; + let hasSideEffects = 0; } // BinOpRM - Instructions like "add reg, reg, [mem]". @@ -764,13 +768,13 @@ class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, EFLAGS, + [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; // BinOpRI_RFF - Instructions like "adc reg, reg, imm". class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, EFLAGS, + [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, EFLAGS))]>; @@ -789,7 +793,7 @@ class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; - + // BinOpRI8_F - Instructions like "cmp reg, imm8". class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> @@ -852,14 +856,14 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo, // BinOpMI_RMW - Instructions like "add [mem], imm". class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, + : BinOpMI<mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>; // BinOpMI_RMW_FF - Instructions like "adc [mem], imm". class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, + : BinOpMI<mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), (implicit EFLAGS)]>; @@ -867,7 +871,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, // BinOpMI_F - Instructions like "cmp [mem], imm". class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, SDPatternOperator opnode, Format f, bits<8> opcode = 0x80> - : BinOpMI<mnemonic, typeinfo, f, + : BinOpMI<mnemonic, typeinfo, f, [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src))], opcode>; @@ -913,6 +917,7 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let ImmT = typeinfo.ImmEncoding; let Uses = [areg]; let Defs = [areg]; + let hasSideEffects = 0; } /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is @@ -928,61 +933,61 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def #NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def #NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; - def #NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; - def #NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; } // isCommutable - def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; - def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; - def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; - def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; - def #NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; - def #NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; - def #NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; - def #NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def #NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; - def #NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; - def #NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; - - def #NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - def #NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; - def #NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; - def #NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; } } // Constraints = "$src1 = $dst" - def #NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def #NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; - def #NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -999,61 +1004,61 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def #NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; } // isCommutable - def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; - def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; - def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; - def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; - def #NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; - def #NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; - def #NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; - def #NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def #NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; - - def #NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; - def #NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; } } // Constraints = "$src1 = $dst" - def #NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def #NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; - def #NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1067,60 +1072,60 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let Defs = [EFLAGS] in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def #NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; } // isCommutable - def #NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; - def #NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; - def #NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; - def #NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; - def #NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; - def #NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; - def #NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; - def #NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def #NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; - - def #NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; - def #NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; } - def #NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def #NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>; - def #NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; - - def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } } @@ -1180,7 +1185,7 @@ let isCompare = 1, Defs = [EFLAGS] in { def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; - + def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, "{$src, %al|AL, $src}">; def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, @@ -1204,12 +1209,12 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))], + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))], IIC_BIN_NONMEM>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, - (X86andn_flag RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>; + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { @@ -1217,6 +1222,17 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W; } +let Predicates = [HasBMI] in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // MULX Instruction // diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index adeaf54..8f2d0a1 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -17,19 +17,19 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", isCommutable = 1 in { - def #NAME#16rr + def NAME#16rr : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))], IIC_CMOV16_RR>,TB,OpSize; - def #NAME#32rr + def NAME#32rr : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))], IIC_CMOV32_RR>, TB; - def #NAME#64rr + def NAME#64rr :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), [(set GR64:$dst, @@ -38,18 +38,18 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { } let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in { - def #NAME#16rm + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), CondNode, EFLAGS))], IIC_CMOV16_RM>, TB, OpSize; - def #NAME#32rm + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; - def #NAME#64rm + def NAME#64rm :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 99c2b8f..2a26a22 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -165,6 +165,33 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), } +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In32BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In32BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + //===----------------------------------------------------------------------===// // Pseudo instructions used by segmented stacks. // @@ -230,25 +257,18 @@ def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), IIC_ALU_NONMEM>; // Use sbb to materialize carry bit. -let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1 in { // FIXME: These are pseudo ops that should be replaced with Pat<> patterns. // However, Pat<> can't replicate the destination reg into the inputs of the // result. -// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces -// X86CodeEmitter. -def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "", - [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))], - IIC_ALU_NONMEM>; -def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))], - IIC_ALU_NONMEM>, - OpSize; -def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))], - IIC_ALU_NONMEM>; -def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))], - IIC_ALU_NONMEM>; +def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; } // isCodeGenOnly @@ -453,6 +473,11 @@ def CMOV_GR16 : I<0, Pseudo, "#CMOV_GR16* PSEUDO!", [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; +} // Predicates = [NoCMov] + +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no +// SSE1. +let Predicates = [FPStackf32] in def CMOV_RFP32 : I<0, Pseudo, (outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), @@ -460,6 +485,9 @@ def CMOV_RFP32 : I<0, Pseudo, [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, EFLAGS))]>; +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no +// SSE2. +let Predicates = [FPStackf64] in def CMOV_RFP64 : I<0, Pseudo, (outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), @@ -474,7 +502,6 @@ def CMOV_RFP80 : I<0, Pseudo, [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, EFLAGS))]>; -} // Predicates = [NoCMov] } // UsesCustomInserter = 1, Uses = [EFLAGS] @@ -482,130 +509,74 @@ def CMOV_RFP80 : I<0, Pseudo, // Atomic Instruction Pseudo Instructions //===----------------------------------------------------------------------===// -// Atomic exchange, and, or, xor -let Constraints = "$val = $dst", Defs = [EFLAGS], - usesCustomInserter = 1 in { - -def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), - "#ATOMAND8 PSEUDO!", - [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>; -def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), - "#ATOMOR8 PSEUDO!", - [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>; -def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), - "#ATOMXOR8 PSEUDO!", - [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>; -def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), - "#ATOMNAND8 PSEUDO!", - [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>; - -def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMAND16 PSEUDO!", - [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>; -def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMOR16 PSEUDO!", - [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>; -def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMXOR16 PSEUDO!", - [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>; -def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMNAND16 PSEUDO!", - [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>; -def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val), - "#ATOMMIN16 PSEUDO!", - [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>; -def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMMAX16 PSEUDO!", - [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>; -def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMUMIN16 PSEUDO!", - [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>; -def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), - "#ATOMUMAX16 PSEUDO!", - [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>; - - -def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMAND32 PSEUDO!", - [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>; -def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMOR32 PSEUDO!", - [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>; -def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMXOR32 PSEUDO!", - [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>; -def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMNAND32 PSEUDO!", - [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>; -def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val), - "#ATOMMIN32 PSEUDO!", - [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>; -def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMMAX32 PSEUDO!", - [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>; -def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMUMIN32 PSEUDO!", - [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>; -def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), - "#ATOMUMAX32 PSEUDO!", - [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>; - - - -def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMAND64 PSEUDO!", - [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>; -def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMOR64 PSEUDO!", - [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>; -def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMXOR64 PSEUDO!", - [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>; -def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMNAND64 PSEUDO!", - [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>; -def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val), - "#ATOMMIN64 PSEUDO!", - [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>; -def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMMAX64 PSEUDO!", - [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>; -def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMUMIN64 PSEUDO!", - [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>; -def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), - "#ATOMUMAX64 PSEUDO!", - [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>; +// Pseudo atomic instructions + +multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> { + let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { + def NAME#8 : I<0, Pseudo, (outs GR8:$dst), + (ins i8mem:$ptr, GR8:$val), + !strconcat(mnemonic, "8 PSEUDO!"), []>; + def NAME#16 : I<0, Pseudo,(outs GR16:$dst), + (ins i16mem:$ptr, GR16:$val), + !strconcat(mnemonic, "16 PSEUDO!"), []>; + def NAME#32 : I<0, Pseudo, (outs GR32:$dst), + (ins i32mem:$ptr, GR32:$val), + !strconcat(mnemonic, "32 PSEUDO!"), []>; + def NAME#64 : I<0, Pseudo, (outs GR64:$dst), + (ins i64mem:$ptr, GR64:$val), + !strconcat(mnemonic, "64 PSEUDO!"), []>; + } } -let Constraints = "$val1 = $dst1, $val2 = $dst2", - Defs = [EFLAGS, EAX, EBX, ECX, EDX], - Uses = [EAX, EBX, ECX, EDX], - mayLoad = 1, mayStore = 1, - usesCustomInserter = 1 in { -def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMAND6432 PSEUDO!", []>; -def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMOR6432 PSEUDO!", []>; -def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMXOR6432 PSEUDO!", []>; -def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMNAND6432 PSEUDO!", []>; -def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMADD6432 PSEUDO!", []>; -def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMSUB6432 PSEUDO!", []>; -def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - "#ATOMSWAP6432 PSEUDO!", []>; +multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> { + def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val), + (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val), + (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val), + (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>; + def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val), + (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>; } +// Atomic exchange, and, or, xor +defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">; +defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">; +defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">; +defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">; +defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">; +defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">; +defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">; +defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">; + +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; +defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; + +multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> { + let usesCustomInserter = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in + def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + !strconcat(mnemonic, "6432 PSEUDO!"), []>; +} + +defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">; +defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">; +defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">; +defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">; +defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">; +defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">; +defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">; +defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">; +defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">; +defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">; +defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// @@ -617,7 +588,6 @@ def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), // TODO: Get this to fold the constant into the instruction. let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), - "lock\n\t" "or{l}\t{$zero, $dst|$dst, $zero}", [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK; @@ -634,77 +604,77 @@ multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, Format ImmMod, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { -def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, - RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, - MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), - !strconcat("lock\n\t", mnemonic, "{b}\t", +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize, LOCK; +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), [], IIC_ALU_NONMEM>, LOCK; -def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, - MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - !strconcat("lock\n\t", mnemonic, "{w}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_NONMEM>, OpSize, LOCK; -def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, - RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, - MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - !strconcat("lock\n\t", mnemonic, "{l}\t", + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), [], IIC_ALU_NONMEM>, LOCK; -def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, - RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, - MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - !strconcat("lock\n\t", mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_NONMEM>, LOCK; - -def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, - ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), - !strconcat("lock\n\t", mnemonic, "{b}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; - -def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, - ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), - !strconcat("lock\n\t", mnemonic, "{w}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; -def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, - ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), - !strconcat("lock\n\t", mnemonic, "{l}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; - -def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), - !strconcat("lock\n\t", mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; - -def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), - !strconcat("lock\n\t", mnemonic, "{w}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; -def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize, LOCK; +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), - !strconcat("lock\n\t", mnemonic, "{l}\t", + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), [], IIC_ALU_MEM>, LOCK; -def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), - !strconcat("lock\n\t", mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; } @@ -717,107 +687,117 @@ defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; // Optimized codegen when the non-memory output is not used. +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { -def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), - "lock\n\t" - "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; -def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), - "lock\n\t" - "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; -def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), - "lock\n\t" - "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; -def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), - "lock\n\t" - "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; - -def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), - "lock\n\t" - "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; -def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), - "lock\n\t" - "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; -def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), - "lock\n\t" - "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; -def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), - "lock\n\t" - "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [], IIC_UNARY_MEM>, OpSize, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +} } -// Atomic compare and swap. -let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - isCodeGenOnly = 1 in -def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), - "lock\n\t" - "cmpxchg8b\t$ptr", - [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK; +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; -let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - isCodeGenOnly = 1 in -def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), - "lock\n\t" - "cmpxchg16b\t$ptr", - [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK, - Requires<[HasCmpxchg16b]>; - -let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in { -def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap), - "lock\n\t" - "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK; +// Atomic compare and swap. +multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, + SDPatternOperator frag, X86MemOperand x86memop, + InstrItinClass itin> { +let isCodeGenOnly = 1 in { + def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), + !strconcat(mnemonic, "\t$ptr"), + [(frag addr:$ptr)], itin>, TB, LOCK; +} } -let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in { -def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap), - "lock\n\t" - "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK; +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic, SDPatternOperator frag, + InstrItinClass itin8, InstrItinClass itin> { +let isCodeGenOnly = 1 in { + let Defs = [AL, EFLAGS], Uses = [AL] in + def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), + !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; + let Defs = [AX, EFLAGS], Uses = [AX] in + def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), + !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK; + let Defs = [EAX, EFLAGS], Uses = [EAX] in + def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), + !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in + def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), + !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; +} } -let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in { -def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap), - "lock\n\t" - "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK; +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in { +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", + X86cas8, i64mem, + IIC_CMPX_LOCK_8B>; } -let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in { -def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap), - "lock\n\t" - "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK; +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b] in { +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", + X86cas16, i128mem, + IIC_CMPX_LOCK_16B>, REX_W; } +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", + X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; + // Atomic exchange and add -let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { -def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), - "lock\n\t" - "xadd{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))], - IIC_XADD_LOCK_MEM8>, - TB, LOCK; -def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), - "lock\n\t" - "xadd{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))], - IIC_XADD_LOCK_MEM>, - TB, OpSize, LOCK; -def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), - "lock\n\t" - "xadd{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))], - IIC_XADD_LOCK_MEM>, - TB, LOCK; -def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr), - "lock\n\t" - "xadd{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))], - IIC_XADD_LOCK_MEM>, - TB, LOCK; +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag, + InstrItinClass itin8, InstrItinClass itin> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin8>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } } +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", + IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, + TB, LOCK; + def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), "#ACQUIRE_MOV PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; @@ -1017,7 +997,24 @@ def : Pat<(X86call (i64 tglobaladdr:$dst)), def : Pat<(X86call (i64 texternalsym:$dst)), (CALL64pcrel32 texternalsym:$dst)>; -// tailcall stuff +// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, Requires<[In32BitMode]>; @@ -1041,7 +1038,9 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, Requires<[In64BitMode]>; -def : Pat<(X86tcret (load addr:$dst), imm:$off), +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. +def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 5663800..f48f133 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -42,7 +42,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, - VR256:$src3)))]>; + VR256:$src3)))]>, VEX_L; let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), @@ -51,7 +51,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, - (MemFrag256 addr:$src3))))]>; + (MemFrag256 addr:$src3))))]>, VEX_L; } } // Constraints = "$src1 = $dst" @@ -220,7 +220,7 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; // For disassembler -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1, hasSideEffects = 0 in def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, @@ -280,21 +280,21 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_W, MemOp4; + VEX_W, MemOp4, VEX_L; def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, - (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4; + (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L; def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, - (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; + [(set VR256:$dst, (OpNode VR256:$src1, + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; // For disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -302,12 +302,11 @@ let isCodeGenOnly = 1 in { def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_L; } // isCodeGenOnly = 1 } -let Predicates = [HasFMA4] in { - defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, int_x86_fma_vfmadd_ss>; @@ -337,29 +336,33 @@ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, int_x86_fma_vfnmsub_sd>; -defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, - memopv2f64, memopv4f64>; -defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, - memopv2f64, memopv4f64>; -defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, - memopv2f64, memopv4f64>; -defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, - memopv2f64, memopv4f64>; -defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, - memopv2f64, memopv4f64>; -defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, - memopv4f32, memopv8f32>; -defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, - memopv2f64, memopv4f64>; -} // HasFMA4 +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + memopv4f32, memopv8f32>; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + memopv2f64, memopv4f64>; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +} diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 55ad2ec..6151d5c 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -44,14 +44,15 @@ def RawFrmImm16 : Format<44>; def MRM_D0 : Format<45>; def MRM_D1 : Format<46>; def MRM_D4 : Format<47>; -def MRM_D8 : Format<48>; -def MRM_D9 : Format<49>; -def MRM_DA : Format<50>; -def MRM_DB : Format<51>; -def MRM_DC : Format<52>; -def MRM_DD : Format<53>; -def MRM_DE : Format<54>; -def MRM_DF : Format<55>; +def MRM_D5 : Format<48>; +def MRM_D8 : Format<49>; +def MRM_D9 : Format<50>; +def MRM_DA : Format<51>; +def MRM_DB : Format<52>; +def MRM_DC : Format<53>; +def MRM_DD : Format<54>; +def MRM_DE : Format<55>; +def MRM_DF : Format<56>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -569,7 +570,7 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + : I<o, F, outs, ins, asm, pattern, itin>, TA, OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 9035435..7025e93 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -27,6 +27,11 @@ def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>]>; +def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>; +def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>; +def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>; +def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>; + def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -90,9 +95,20 @@ def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>]>>; +def X86vfpround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>]>>; def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; @@ -117,6 +133,7 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; @@ -176,9 +193,7 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; -def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>; -def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>; -def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>; +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 4f3d824..17714ac 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -17,15 +17,15 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/DerivedTypes.h" -#include "llvm/LLVMContext.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" @@ -297,7 +297,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, - { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, @@ -355,7 +355,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, // AVX 128-bit versions of foldable instructions - { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -467,9 +467,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, - { X86::SQRTPDr_Int, X86::SQRTPDm_Int, TB_ALIGN_16 }, { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, - { X86::SQRTPSr_Int, X86::SQRTPSm_Int, TB_ALIGN_16 }, { X86::SQRTSDr, X86::SQRTSDm, 0 }, { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, { X86::SQRTSSr, X86::SQRTSSm, 0 }, @@ -510,27 +508,25 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, - { X86::VMOVUPDrr, X86::VMOVUPDrm, TB_ALIGN_16 }, + { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, - { X86::VPABSBrr128, X86::VPABSBrm128, TB_ALIGN_16 }, - { X86::VPABSDrr128, X86::VPABSDrm128, TB_ALIGN_16 }, - { X86::VPABSWrr128, X86::VPABSWrm128, TB_ALIGN_16 }, - { X86::VPERMILPDri, X86::VPERMILPDmi, TB_ALIGN_16 }, - { X86::VPERMILPSri, X86::VPERMILPSmi, TB_ALIGN_16 }, - { X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 }, - { X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 }, - { X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 }, - { X86::VRCPPSr, X86::VRCPPSm, TB_ALIGN_16 }, - { X86::VRCPPSr_Int, X86::VRCPPSm_Int, TB_ALIGN_16 }, - { X86::VRSQRTPSr, X86::VRSQRTPSm, TB_ALIGN_16 }, - { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, TB_ALIGN_16 }, - { X86::VSQRTPDr, X86::VSQRTPDm, TB_ALIGN_16 }, - { X86::VSQRTPDr_Int, X86::VSQRTPDm_Int, TB_ALIGN_16 }, - { X86::VSQRTPSr, X86::VSQRTPSm, TB_ALIGN_16 }, - { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 }, + { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, + { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, + { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, + { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, + { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, + { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VRCPPSr, X86::VRCPPSm, 0 }, + { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, + { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, + { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, + { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, + { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, @@ -541,26 +537,52 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, - { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 }, - { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 }, + { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, + { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, // AVX2 foldable instructions - { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 }, - { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 }, - { X86::VPABSWrr256, X86::VPABSWrm256, TB_ALIGN_32 }, - { X86::VPSHUFDYri, X86::VPSHUFDYmi, TB_ALIGN_32 }, - { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, TB_ALIGN_32 }, - { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, TB_ALIGN_32 }, - { X86::VRCPPSYr, X86::VRCPPSYm, TB_ALIGN_32 }, - { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, TB_ALIGN_32 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, TB_ALIGN_32 }, - { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, TB_ALIGN_32 }, - { X86::VSQRTPDYr, X86::VSQRTPDYm, TB_ALIGN_32 }, - { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 }, - { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 }, + { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, + { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, + { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, + { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, + { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + + // BMI/BMI2/LZCNT/POPCNT foldable instructions + { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, + { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BLSI32rr, X86::BLSI32rm, 0 }, + { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, + { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, + { X86::BLSR32rr, X86::BLSR32rm, 0 }, + { X86::BLSR64rr, X86::BLSR64rm, 0 }, + { X86::BZHI32rr, X86::BZHI32rm, 0 }, + { X86::BZHI64rr, X86::BZHI64rm, 0 }, + { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, + { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, + { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, + { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, + { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, + { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, + { X86::RORX32ri, X86::RORX32mi, 0 }, + { X86::RORX64ri, X86::RORX64mi, 0 }, + { X86::SARX32rr, X86::SARX32rm, 0 }, + { X86::SARX64rr, X86::SARX64rm, 0 }, + { X86::SHRX32rr, X86::SHRX32rm, 0 }, + { X86::SHRX64rr, X86::SHRX64rm, 0 }, + { X86::SHLX32rr, X86::SHLX32rm, 0 }, + { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, + { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, + { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -681,21 +703,13 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, - { X86::MAXPDrr_Int, X86::MAXPDrm_Int, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, - { X86::MAXPSrr_Int, X86::MAXPSrm_Int, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, - { X86::MINPDrr_Int, X86::MINPDrm_Int, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, - { X86::MINPSrr_Int, X86::MINPSrm_Int, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, @@ -746,6 +760,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, + { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, + { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, + { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, + { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, + { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, + { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, + { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, @@ -817,31 +839,31 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VADDPDrr, X86::VADDPDrm, TB_ALIGN_16 }, - { X86::VADDPSrr, X86::VADDPSrm, TB_ALIGN_16 }, + { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSUBPDrr, X86::VADDSUBPDrm, TB_ALIGN_16 }, - { X86::VADDSUBPSrr, X86::VADDSUBPSrm, TB_ALIGN_16 }, - { X86::VANDNPDrr, X86::VANDNPDrm, TB_ALIGN_16 }, - { X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 }, - { X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 }, - { X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 }, - { X86::VBLENDPDrri, X86::VBLENDPDrmi, TB_ALIGN_16 }, - { X86::VBLENDPSrri, X86::VBLENDPSrmi, TB_ALIGN_16 }, - { X86::VBLENDVPDrr, X86::VBLENDVPDrm, TB_ALIGN_16 }, - { X86::VBLENDVPSrr, X86::VBLENDVPSrm, TB_ALIGN_16 }, - { X86::VCMPPDrri, X86::VCMPPDrmi, TB_ALIGN_16 }, - { X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 }, + { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, + { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, + { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, + { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, + { X86::VANDPDrr, X86::VANDPDrm, 0 }, + { X86::VANDPSrr, X86::VANDPSrm, 0 }, + { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, + { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, + { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, + { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, + { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, - { X86::VDIVPDrr, X86::VDIVPDrm, TB_ALIGN_16 }, - { X86::VDIVPSrr, X86::VDIVPSrm, TB_ALIGN_16 }, + { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, @@ -852,282 +874,290 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 }, - { X86::VHADDPDrr, X86::VHADDPDrm, TB_ALIGN_16 }, - { X86::VHADDPSrr, X86::VHADDPSrm, TB_ALIGN_16 }, - { X86::VHSUBPDrr, X86::VHSUBPDrm, TB_ALIGN_16 }, - { X86::VHSUBPSrr, X86::VHSUBPSrm, TB_ALIGN_16 }, + { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, + { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, + { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, + { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, - { X86::VMAXPDrr, X86::VMAXPDrm, TB_ALIGN_16 }, - { X86::VMAXPDrr_Int, X86::VMAXPDrm_Int, TB_ALIGN_16 }, - { X86::VMAXPSrr, X86::VMAXPSrm, TB_ALIGN_16 }, - { X86::VMAXPSrr_Int, X86::VMAXPSrm_Int, TB_ALIGN_16 }, + { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, - { X86::VMINPDrr, X86::VMINPDrm, TB_ALIGN_16 }, - { X86::VMINPDrr_Int, X86::VMINPDrm_Int, TB_ALIGN_16 }, - { X86::VMINPSrr, X86::VMINPSrm, TB_ALIGN_16 }, - { X86::VMINPSrr_Int, X86::VMINPSrm_Int, TB_ALIGN_16 }, + { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, - { X86::VMPSADBWrri, X86::VMPSADBWrmi, TB_ALIGN_16 }, - { X86::VMULPDrr, X86::VMULPDrm, TB_ALIGN_16 }, - { X86::VMULPSrr, X86::VMULPSrm, TB_ALIGN_16 }, + { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, + { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VORPDrr, X86::VORPDrm, TB_ALIGN_16 }, - { X86::VORPSrr, X86::VORPSrm, TB_ALIGN_16 }, - { X86::VPACKSSDWrr, X86::VPACKSSDWrm, TB_ALIGN_16 }, - { X86::VPACKSSWBrr, X86::VPACKSSWBrm, TB_ALIGN_16 }, - { X86::VPACKUSDWrr, X86::VPACKUSDWrm, TB_ALIGN_16 }, - { X86::VPACKUSWBrr, X86::VPACKUSWBrm, TB_ALIGN_16 }, - { X86::VPADDBrr, X86::VPADDBrm, TB_ALIGN_16 }, - { X86::VPADDDrr, X86::VPADDDrm, TB_ALIGN_16 }, - { X86::VPADDQrr, X86::VPADDQrm, TB_ALIGN_16 }, - { X86::VPADDSBrr, X86::VPADDSBrm, TB_ALIGN_16 }, - { X86::VPADDSWrr, X86::VPADDSWrm, TB_ALIGN_16 }, - { X86::VPADDUSBrr, X86::VPADDUSBrm, TB_ALIGN_16 }, - { X86::VPADDUSWrr, X86::VPADDUSWrm, TB_ALIGN_16 }, - { X86::VPADDWrr, X86::VPADDWrm, TB_ALIGN_16 }, - { X86::VPALIGNR128rr, X86::VPALIGNR128rm, TB_ALIGN_16 }, - { X86::VPANDNrr, X86::VPANDNrm, TB_ALIGN_16 }, - { X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 }, - { X86::VPAVGBrr, X86::VPAVGBrm, TB_ALIGN_16 }, - { X86::VPAVGWrr, X86::VPAVGWrm, TB_ALIGN_16 }, - { X86::VPBLENDWrri, X86::VPBLENDWrmi, TB_ALIGN_16 }, - { X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 }, - { X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 }, - { X86::VPCMPEQQrr, X86::VPCMPEQQrm, TB_ALIGN_16 }, - { X86::VPCMPEQWrr, X86::VPCMPEQWrm, TB_ALIGN_16 }, - { X86::VPCMPGTBrr, X86::VPCMPGTBrm, TB_ALIGN_16 }, - { X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 }, - { X86::VPCMPGTQrr, X86::VPCMPGTQrm, TB_ALIGN_16 }, - { X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 }, - { X86::VPHADDDrr, X86::VPHADDDrm, TB_ALIGN_16 }, - { X86::VPHADDSWrr128, X86::VPHADDSWrm128, TB_ALIGN_16 }, - { X86::VPHADDWrr, X86::VPHADDWrm, TB_ALIGN_16 }, - { X86::VPHSUBDrr, X86::VPHSUBDrm, TB_ALIGN_16 }, - { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, TB_ALIGN_16 }, - { X86::VPHSUBWrr, X86::VPHSUBWrm, TB_ALIGN_16 }, - { X86::VPERMILPDrr, X86::VPERMILPDrm, TB_ALIGN_16 }, - { X86::VPERMILPSrr, X86::VPERMILPSrm, TB_ALIGN_16 }, - { X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 }, - { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, TB_ALIGN_16 }, - { X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 }, - { X86::VPMAXSWrr, X86::VPMAXSWrm, TB_ALIGN_16 }, - { X86::VPMAXUBrr, X86::VPMAXUBrm, TB_ALIGN_16 }, - { X86::VPMINSWrr, X86::VPMINSWrm, TB_ALIGN_16 }, - { X86::VPMINUBrr, X86::VPMINUBrm, TB_ALIGN_16 }, - { X86::VPMULDQrr, X86::VPMULDQrm, TB_ALIGN_16 }, - { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, TB_ALIGN_16 }, - { X86::VPMULHUWrr, X86::VPMULHUWrm, TB_ALIGN_16 }, - { X86::VPMULHWrr, X86::VPMULHWrm, TB_ALIGN_16 }, - { X86::VPMULLDrr, X86::VPMULLDrm, TB_ALIGN_16 }, - { X86::VPMULLWrr, X86::VPMULLWrm, TB_ALIGN_16 }, - { X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 }, - { X86::VPORrr, X86::VPORrm, TB_ALIGN_16 }, - { X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 }, - { X86::VPSHUFBrr, X86::VPSHUFBrm, TB_ALIGN_16 }, - { X86::VPSIGNBrr, X86::VPSIGNBrm, TB_ALIGN_16 }, - { X86::VPSIGNWrr, X86::VPSIGNWrm, TB_ALIGN_16 }, - { X86::VPSIGNDrr, X86::VPSIGNDrm, TB_ALIGN_16 }, - { X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 }, - { X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 }, - { X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 }, - { X86::VPSRADrr, X86::VPSRADrm, TB_ALIGN_16 }, - { X86::VPSRAWrr, X86::VPSRAWrm, TB_ALIGN_16 }, - { X86::VPSRLDrr, X86::VPSRLDrm, TB_ALIGN_16 }, - { X86::VPSRLQrr, X86::VPSRLQrm, TB_ALIGN_16 }, - { X86::VPSRLWrr, X86::VPSRLWrm, TB_ALIGN_16 }, - { X86::VPSUBBrr, X86::VPSUBBrm, TB_ALIGN_16 }, - { X86::VPSUBDrr, X86::VPSUBDrm, TB_ALIGN_16 }, - { X86::VPSUBSBrr, X86::VPSUBSBrm, TB_ALIGN_16 }, - { X86::VPSUBSWrr, X86::VPSUBSWrm, TB_ALIGN_16 }, - { X86::VPSUBWrr, X86::VPSUBWrm, TB_ALIGN_16 }, - { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, TB_ALIGN_16 }, - { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, TB_ALIGN_16 }, - { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, TB_ALIGN_16 }, - { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, TB_ALIGN_16 }, - { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, TB_ALIGN_16 }, - { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, TB_ALIGN_16 }, - { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, TB_ALIGN_16 }, - { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, TB_ALIGN_16 }, - { X86::VPXORrr, X86::VPXORrm, TB_ALIGN_16 }, - { X86::VSHUFPDrri, X86::VSHUFPDrmi, TB_ALIGN_16 }, - { X86::VSHUFPSrri, X86::VSHUFPSrmi, TB_ALIGN_16 }, - { X86::VSUBPDrr, X86::VSUBPDrm, TB_ALIGN_16 }, - { X86::VSUBPSrr, X86::VSUBPSrm, TB_ALIGN_16 }, + { X86::VORPDrr, X86::VORPDrm, 0 }, + { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, + { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, + { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, + { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, + { X86::VPADDBrr, X86::VPADDBrm, 0 }, + { X86::VPADDDrr, X86::VPADDDrm, 0 }, + { X86::VPADDQrr, X86::VPADDQrm, 0 }, + { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, + { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, + { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, + { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, + { X86::VPADDWrr, X86::VPADDWrm, 0 }, + { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, + { X86::VPANDNrr, X86::VPANDNrm, 0 }, + { X86::VPANDrr, X86::VPANDrm, 0 }, + { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, + { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, + { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, + { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, + { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, + { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, + { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, + { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, + { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, + { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, + { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, + { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, + { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, + { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, + { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, + { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, + { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, + { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, + { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, + { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, + { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, + { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, + { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, + { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, + { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, + { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, + { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, + { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, + { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, + { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, + { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, + { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, + { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, + { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, + { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, + { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, + { X86::VPORrr, X86::VPORrm, 0 }, + { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, + { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, + { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, + { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, + { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, + { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, + { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, + { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, + { X86::VPSRADrr, X86::VPSRADrm, 0 }, + { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, + { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, + { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, + { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, + { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, + { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, + { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, + { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, + { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, + { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, + { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, + { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, + { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, + { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, + { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, + { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, + { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, TB_ALIGN_16 }, - { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, TB_ALIGN_16 }, - { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, TB_ALIGN_16 }, - { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 }, - { X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 }, - { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, + { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, + { X86::VXORPDrr, X86::VXORPDrm, 0 }, + { X86::VXORPSrr, X86::VXORPSrm, 0 }, // AVX 256-bit foldable instructions - { X86::VADDPDYrr, X86::VADDPDYrm, TB_ALIGN_32 }, - { X86::VADDPSYrr, X86::VADDPSYrm, TB_ALIGN_32 }, - { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, TB_ALIGN_32 }, - { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, TB_ALIGN_32 }, - { X86::VANDNPDYrr, X86::VANDNPDYrm, TB_ALIGN_32 }, - { X86::VANDNPSYrr, X86::VANDNPSYrm, TB_ALIGN_32 }, - { X86::VANDPDYrr, X86::VANDPDYrm, TB_ALIGN_32 }, - { X86::VANDPSYrr, X86::VANDPSYrm, TB_ALIGN_32 }, - { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, TB_ALIGN_32 }, - { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, TB_ALIGN_32 }, - { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, TB_ALIGN_32 }, - { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, TB_ALIGN_32 }, - { X86::VCMPPDYrri, X86::VCMPPDYrmi, TB_ALIGN_32 }, - { X86::VCMPPSYrri, X86::VCMPPSYrmi, TB_ALIGN_32 }, - { X86::VDIVPDYrr, X86::VDIVPDYrm, TB_ALIGN_32 }, - { X86::VDIVPSYrr, X86::VDIVPSYrm, TB_ALIGN_32 }, - { X86::VHADDPDYrr, X86::VHADDPDYrm, TB_ALIGN_32 }, - { X86::VHADDPSYrr, X86::VHADDPSYrm, TB_ALIGN_32 }, - { X86::VHSUBPDYrr, X86::VHSUBPDYrm, TB_ALIGN_32 }, - { X86::VHSUBPSYrr, X86::VHSUBPSYrm, TB_ALIGN_32 }, - { X86::VINSERTF128rr, X86::VINSERTF128rm, TB_ALIGN_32 }, - { X86::VMAXPDYrr, X86::VMAXPDYrm, TB_ALIGN_32 }, - { X86::VMAXPDYrr_Int, X86::VMAXPDYrm_Int, TB_ALIGN_32 }, - { X86::VMAXPSYrr, X86::VMAXPSYrm, TB_ALIGN_32 }, - { X86::VMAXPSYrr_Int, X86::VMAXPSYrm_Int, TB_ALIGN_32 }, - { X86::VMINPDYrr, X86::VMINPDYrm, TB_ALIGN_32 }, - { X86::VMINPDYrr_Int, X86::VMINPDYrm_Int, TB_ALIGN_32 }, - { X86::VMINPSYrr, X86::VMINPSYrm, TB_ALIGN_32 }, - { X86::VMINPSYrr_Int, X86::VMINPSYrm_Int, TB_ALIGN_32 }, - { X86::VMULPDYrr, X86::VMULPDYrm, TB_ALIGN_32 }, - { X86::VMULPSYrr, X86::VMULPSYrm, TB_ALIGN_32 }, - { X86::VORPDYrr, X86::VORPDYrm, TB_ALIGN_32 }, - { X86::VORPSYrr, X86::VORPSYrm, TB_ALIGN_32 }, - { X86::VPERM2F128rr, X86::VPERM2F128rm, TB_ALIGN_32 }, - { X86::VPERMILPDYrr, X86::VPERMILPDYrm, TB_ALIGN_32 }, - { X86::VPERMILPSYrr, X86::VPERMILPSYrm, TB_ALIGN_32 }, - { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, TB_ALIGN_32 }, - { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, TB_ALIGN_32 }, - { X86::VSUBPDYrr, X86::VSUBPDYrm, TB_ALIGN_32 }, - { X86::VSUBPSYrr, X86::VSUBPSYrm, TB_ALIGN_32 }, - { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, TB_ALIGN_32 }, - { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, TB_ALIGN_32 }, - { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, TB_ALIGN_32 }, - { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, TB_ALIGN_32 }, - { X86::VXORPDYrr, X86::VXORPDYrm, TB_ALIGN_32 }, - { X86::VXORPSYrr, X86::VXORPSYrm, TB_ALIGN_32 }, + { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, + { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, + { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, + { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, + { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, + { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, + { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, + { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, + { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, + { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, + { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, + { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, + { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, + { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, + { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, + { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, + { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, + { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, + { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, + { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, + { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, + { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, + { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, + { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, + { X86::VORPDYrr, X86::VORPDYrm, 0 }, + { X86::VORPSYrr, X86::VORPSYrm, 0 }, + { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, + { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, + { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, + { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, + { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, + { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, + { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, + { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, + { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, // AVX2 foldable instructions - { X86::VINSERTI128rr, X86::VINSERTI128rm, TB_ALIGN_16 }, - { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, TB_ALIGN_32 }, - { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, TB_ALIGN_32 }, - { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, TB_ALIGN_32 }, - { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, TB_ALIGN_32 }, - { X86::VPADDBYrr, X86::VPADDBYrm, TB_ALIGN_32 }, - { X86::VPADDDYrr, X86::VPADDDYrm, TB_ALIGN_32 }, - { X86::VPADDQYrr, X86::VPADDQYrm, TB_ALIGN_32 }, - { X86::VPADDSBYrr, X86::VPADDSBYrm, TB_ALIGN_32 }, - { X86::VPADDSWYrr, X86::VPADDSWYrm, TB_ALIGN_32 }, - { X86::VPADDUSBYrr, X86::VPADDUSBYrm, TB_ALIGN_32 }, - { X86::VPADDUSWYrr, X86::VPADDUSWYrm, TB_ALIGN_32 }, - { X86::VPADDWYrr, X86::VPADDWYrm, TB_ALIGN_32 }, - { X86::VPALIGNR256rr, X86::VPALIGNR256rm, TB_ALIGN_32 }, - { X86::VPANDNYrr, X86::VPANDNYrm, TB_ALIGN_32 }, - { X86::VPANDYrr, X86::VPANDYrm, TB_ALIGN_32 }, - { X86::VPAVGBYrr, X86::VPAVGBYrm, TB_ALIGN_32 }, - { X86::VPAVGWYrr, X86::VPAVGWYrm, TB_ALIGN_32 }, - { X86::VPBLENDDrri, X86::VPBLENDDrmi, TB_ALIGN_32 }, - { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, TB_ALIGN_32 }, - { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, TB_ALIGN_32 }, - { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, TB_ALIGN_32 }, - { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, TB_ALIGN_32 }, - { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, TB_ALIGN_32 }, - { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, TB_ALIGN_32 }, - { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, TB_ALIGN_32 }, - { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, TB_ALIGN_32 }, - { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, TB_ALIGN_32 }, - { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 }, - { X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 }, - { X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 }, - { X86::VPERMPDYri, X86::VPERMPDYmi, TB_ALIGN_32 }, - { X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 }, - { X86::VPERMQYri, X86::VPERMQYmi, TB_ALIGN_32 }, - { X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 }, - { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 }, - { X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 }, - { X86::VPHSUBDYrr, X86::VPHSUBDYrm, TB_ALIGN_32 }, - { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, TB_ALIGN_32 }, - { X86::VPHSUBWYrr, X86::VPHSUBWYrm, TB_ALIGN_32 }, - { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, TB_ALIGN_32 }, - { X86::VPMADDWDYrr, X86::VPMADDWDYrm, TB_ALIGN_32 }, - { X86::VPMAXSWYrr, X86::VPMAXSWYrm, TB_ALIGN_32 }, - { X86::VPMAXUBYrr, X86::VPMAXUBYrm, TB_ALIGN_32 }, - { X86::VPMINSWYrr, X86::VPMINSWYrm, TB_ALIGN_32 }, - { X86::VPMINUBYrr, X86::VPMINUBYrm, TB_ALIGN_32 }, - { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, TB_ALIGN_32 }, - { X86::VPMULDQYrr, X86::VPMULDQYrm, TB_ALIGN_32 }, - { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, TB_ALIGN_32 }, - { X86::VPMULHUWYrr, X86::VPMULHUWYrm, TB_ALIGN_32 }, - { X86::VPMULHWYrr, X86::VPMULHWYrm, TB_ALIGN_32 }, - { X86::VPMULLDYrr, X86::VPMULLDYrm, TB_ALIGN_32 }, - { X86::VPMULLWYrr, X86::VPMULLWYrm, TB_ALIGN_32 }, - { X86::VPMULUDQYrr, X86::VPMULUDQYrm, TB_ALIGN_32 }, - { X86::VPORYrr, X86::VPORYrm, TB_ALIGN_32 }, - { X86::VPSADBWYrr, X86::VPSADBWYrm, TB_ALIGN_32 }, - { X86::VPSHUFBYrr, X86::VPSHUFBYrm, TB_ALIGN_32 }, - { X86::VPSIGNBYrr, X86::VPSIGNBYrm, TB_ALIGN_32 }, - { X86::VPSIGNWYrr, X86::VPSIGNWYrm, TB_ALIGN_32 }, - { X86::VPSIGNDYrr, X86::VPSIGNDYrm, TB_ALIGN_32 }, - { X86::VPSLLDYrr, X86::VPSLLDYrm, TB_ALIGN_16 }, - { X86::VPSLLQYrr, X86::VPSLLQYrm, TB_ALIGN_16 }, - { X86::VPSLLWYrr, X86::VPSLLWYrm, TB_ALIGN_16 }, - { X86::VPSLLVDrr, X86::VPSLLVDrm, TB_ALIGN_16 }, - { X86::VPSLLVDYrr, X86::VPSLLVDYrm, TB_ALIGN_32 }, - { X86::VPSLLVQrr, X86::VPSLLVQrm, TB_ALIGN_16 }, - { X86::VPSLLVQYrr, X86::VPSLLVQYrm, TB_ALIGN_32 }, - { X86::VPSRADYrr, X86::VPSRADYrm, TB_ALIGN_16 }, - { X86::VPSRAWYrr, X86::VPSRAWYrm, TB_ALIGN_16 }, - { X86::VPSRAVDrr, X86::VPSRAVDrm, TB_ALIGN_16 }, - { X86::VPSRAVDYrr, X86::VPSRAVDYrm, TB_ALIGN_32 }, - { X86::VPSRLDYrr, X86::VPSRLDYrm, TB_ALIGN_16 }, - { X86::VPSRLQYrr, X86::VPSRLQYrm, TB_ALIGN_16 }, - { X86::VPSRLWYrr, X86::VPSRLWYrm, TB_ALIGN_16 }, - { X86::VPSRLVDrr, X86::VPSRLVDrm, TB_ALIGN_16 }, - { X86::VPSRLVDYrr, X86::VPSRLVDYrm, TB_ALIGN_32 }, - { X86::VPSRLVQrr, X86::VPSRLVQrm, TB_ALIGN_16 }, - { X86::VPSRLVQYrr, X86::VPSRLVQYrm, TB_ALIGN_32 }, - { X86::VPSUBBYrr, X86::VPSUBBYrm, TB_ALIGN_32 }, - { X86::VPSUBDYrr, X86::VPSUBDYrm, TB_ALIGN_32 }, - { X86::VPSUBSBYrr, X86::VPSUBSBYrm, TB_ALIGN_32 }, - { X86::VPSUBSWYrr, X86::VPSUBSWYrm, TB_ALIGN_32 }, - { X86::VPSUBWYrr, X86::VPSUBWYrm, TB_ALIGN_32 }, - { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, TB_ALIGN_32 }, - { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, TB_ALIGN_32 }, - { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, TB_ALIGN_16 }, - { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, TB_ALIGN_32 }, - { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, TB_ALIGN_32 }, - { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, TB_ALIGN_32 }, - { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, TB_ALIGN_32 }, - { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 }, - { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 }, + { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, + { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, + { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, + { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, + { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, + { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, + { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, + { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, + { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, + { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, + { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, + { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, + { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, + { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, + { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, + { X86::VPANDYrr, X86::VPANDYrm, 0 }, + { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, + { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, + { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, + { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, + { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, + { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, + { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, + { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, + { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, + { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, + { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, + { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, + { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, + { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, + { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, + { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, + { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, + { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, + { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, + { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, + { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, + { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, + { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, + { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, + { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, + { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, + { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, + { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, + { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, + { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, + { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, + { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, + { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, + { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, + { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, + { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, + { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, + { X86::VPORYrr, X86::VPORYrm, 0 }, + { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, + { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, + { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, + { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, + { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, + { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, + { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, + { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, + { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, + { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, + { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, + { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, + { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, + { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, + { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, + { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, + { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, + { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, + { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, + { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, + { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, + { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, + { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, + { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, + { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, + { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, + { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, + { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, + { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, + { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, + { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, + { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, + { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, + { X86::VPXORYrr, X86::VPXORYrm, 0 }, // FIXME: add AVX 256-bit foldable instructions // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_16 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_16 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_16 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_16 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, @@ -1140,6 +1170,16 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, + + // BMI/BMI2 foldable instructions + { X86::ANDN32rr, X86::ANDN32rm, 0 }, + { X86::ANDN64rr, X86::ANDN64rm, 0 }, + { X86::MULX32rr, X86::MULX32rm, 0 }, + { X86::MULX64rr, X86::MULX64rm, 0 }, + { X86::PDEP32rr, X86::PDEP32rm, 0 }, + { X86::PDEP64rr, X86::PDEP64rm, 0 }, + { X86::PEXT32rr, X86::PEXT32rm, 0 }, + { X86::PEXT64rr, X86::PEXT64rm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -1269,22 +1309,26 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_16 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_16 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 }, { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_16 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_16 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, @@ -1529,16 +1573,19 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVDQArm: + case X86::MOVDQUrm: case X86::VMOVSSrm: case X86::VMOVSDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: + case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::FsVMOVAPSrm: @@ -2137,7 +2184,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfoImpl::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -2216,7 +2263,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. } default: - return TargetInstrInfoImpl::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI); } } @@ -2266,7 +2313,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) { } /// getCondFromCmovOpc - return condition code of a CMov opcode. -static X86::CondCode getCondFromCMovOpc(unsigned Opc) { +X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm: @@ -2818,6 +2865,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - colobbersTheStack. if (SrcReg == X86::EFLAGS) { if (X86::GR64RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF64)); @@ -3127,11 +3176,15 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: + case X86::DEC64_32r: case X86::DEC64_16r: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: + case X86::INC64_32r: case X86::INC64_16r: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: @@ -3147,6 +3200,8 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::ANDN32rr: case X86::ANDN32rm: + case X86::ANDN64rr: case X86::ANDN64rm: return true; } } @@ -3306,7 +3361,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, if (OldCC != X86::COND_INVALID) OpcIsSET = true; else - OldCC = getCondFromCMovOpc(Instr.getOpcode()); + OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); } if (OldCC == X86::COND_INVALID) return false; } @@ -3371,12 +3426,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst); } - // Make sure Sub instruction defines EFLAGS. + // Make sure Sub instruction defines EFLAGS and mark the def live. + unsigned LastOperand = Sub->getNumOperands() - 1; assert(Sub->getNumOperands() >= 2 && - Sub->getOperand(Sub->getNumOperands()-1).isReg() && - Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS && + Sub->getOperand(LastOperand).isReg() && + Sub->getOperand(LastOperand).getReg() == X86::EFLAGS && "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND"); - Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true); + Sub->getOperand(LastOperand).setIsDef(true); + Sub->getOperand(LastOperand).setIsDead(false); CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. @@ -3467,35 +3524,44 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, /// to: /// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> /// -static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) { +static bool Expand2AddrUndef(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - unsigned Reg = MI->getOperand(0).getReg(); - MI->setDesc(Desc); + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any // implicit operands. - MachineInstrBuilder(MI).addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); // But we don't trust that. - assert(MI->getOperand(1).getReg() == Reg && - MI->getOperand(2).getReg() == Reg && "Misplaced operand"); + assert(MIB->getOperand(1).getReg() == Reg && + MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); return true; } bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { + case X86::SETB_C8r: + return Expand2AddrUndef(MIB, get(X86::SBB8rr)); + case X86::SETB_C16r: + return Expand2AddrUndef(MIB, get(X86::SBB16rr)); + case X86::SETB_C32r: + return Expand2AddrUndef(MIB, get(X86::SBB32rr)); + case X86::SETB_C64r: + return Expand2AddrUndef(MIB, get(X86::SBB64rr)); case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: - return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: assert(HasAVX && "AVX not supported"); - return Expand2AddrUndef(MI, get(X86::VXORPSYrr)); + return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); case X86::V_SETALLONES: - return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: - return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr)); + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; @@ -3521,9 +3587,10 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, MachineInstr *MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. + // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); - MachineInstrBuilder MIB(NewMI); + MachineInstrBuilder MIB(MF, NewMI); unsigned NumAddrOps = MOs.size(); for (unsigned i = 0; i != NumAddrOps; ++i) MIB.addOperand(MOs[i]); @@ -3547,9 +3614,10 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, const SmallVectorImpl<MachineOperand> &MOs, MachineInstr *MI, const TargetInstrInfo &TII) { + // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); - MachineInstrBuilder MIB(NewMI); + MachineInstrBuilder MIB(MF, NewMI); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); @@ -3796,7 +3864,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) && + if (!MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return 0; @@ -3837,7 +3906,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) && + if (!MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return 0; @@ -3940,6 +4010,21 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } default: { + if ((LoadMI->getOpcode() == X86::MOVSSrm || + LoadMI->getOpcode() == X86::VMOVSSrm) && + MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() + > 4) + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes). + return NULL; + if ((LoadMI->getOpcode() == X86::MOVSDrm || + LoadMI->getOpcode() == X86::VMOVSDrm) && + MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() + > 8) + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes). + return NULL; + // Folding a normal load. Just copy the load's address operands. unsigned NumOps = LoadMI->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -4007,7 +4092,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) return true; - return TargetInstrInfoImpl::canFoldMemoryOperand(MI, Ops); + return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); } bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, @@ -4072,7 +4157,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, // Emit the data processing instruction. MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); - MachineInstrBuilder MIB(DataMI); + MachineInstrBuilder MIB(MF, DataMI); if (FoldedStore) MIB.addReg(Reg, RegState::Define); @@ -4578,13 +4663,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::DIVSSrr: case X86::DIVSSrr_Int: case X86::SQRTPDm: - case X86::SQRTPDm_Int: case X86::SQRTPDr: - case X86::SQRTPDr_Int: case X86::SQRTPSm: - case X86::SQRTPSm_Int: case X86::SQRTPSr: - case X86::SQRTPSr_Int: case X86::SQRTSDm: case X86::SQRTSDm_Int: case X86::SQRTSDr: @@ -4603,13 +4684,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVSSrr: case X86::VDIVSSrr_Int: case X86::VSQRTPDm: - case X86::VSQRTPDm_Int: case X86::VSQRTPDr: - case X86::VSQRTPDr_Int: case X86::VSQRTPSm: - case X86::VSQRTPSm_Int: case X86::VSQRTPSr: - case X86::VSQRTPSr_Int: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: case X86::VSQRTSDr: diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index b6f69af..260f054 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -61,6 +61,9 @@ namespace X86 { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); + // Turn CMov opcode into condition code. + CondCode getCondFromCMovOpc(unsigned Opc); + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(X86::CondCode CC); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 304676d..9ecf5e2 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -216,6 +216,14 @@ def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; +def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -239,9 +247,9 @@ def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; -def X86blsi_flag : SDNode<"X86ISD::BLSI", SDTUnaryArithWithFlags>; -def X86blsmsk_flag : SDNode<"X86ISD::BLSMSK", SDTUnaryArithWithFlags>; -def X86blsr_flag : SDNode<"X86ISD::BLSR", SDTUnaryArithWithFlags>; +def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>; +def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>; +def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -397,7 +405,7 @@ def i64mem_TC : Operand<i64> { let OperandType = "OPERAND_PCREL", ParserMatchClass = X86AbsMemAsmOperand, - PrintMethod = "print_pcrel_imm" in { + PrintMethod = "printPCRelImm" in { def i32imm_pcrel : Operand<i32>; def i16imm_pcrel : Operand<i16>; @@ -418,7 +426,7 @@ def SSECC : Operand<i8> { } def AVXCC : Operand<i8> { - let PrintMethod = "printSSECC"; + let PrintMethod = "printAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } @@ -499,7 +507,7 @@ def i64i32imm : Operand<i64> { // 64-bits but only 32 bits are significant, and those bits are treated as being // pc relative. def i64i32imm_pcrel : Operand<i64> { - let PrintMethod = "print_pcrel_imm"; + let PrintMethod = "printPCRelImm"; let ParserMatchClass = X86AbsMemAsmOperand; let OperandType = "OPERAND_PCREL"; } @@ -552,17 +560,17 @@ def HasMMX : Predicate<"Subtarget->hasMMX()">; def Has3DNow : Predicate<"Subtarget->has3DNow()">; def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; -def UseSSE1 : Predicate<"Subtarget->hasSSE1() && Subtarget->hasNoAVX()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; -def UseSSE2 : Predicate<"Subtarget->hasSSE2() && Subtarget->hasNoAVX()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; -def UseSSE3 : Predicate<"Subtarget->hasSSE3() && Subtarget->hasNoAVX()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; -def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && Subtarget->hasNoAVX()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; -def UseSSE41 : Predicate<"Subtarget->hasSSE41() && Subtarget->hasNoAVX()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; -def UseSSE42 : Predicate<"Subtarget->hasSSE42() && Subtarget->hasNoAVX()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; @@ -581,13 +589,14 @@ def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def In32BitMode : Predicate<"!Subtarget->is64Bit()">, - AssemblerPredicate<"!Mode64Bit">; + AssemblerPredicate<"!Mode64Bit", "32-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, - AssemblerPredicate<"Mode64Bit">; + AssemblerPredicate<"Mode64Bit", "64-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; @@ -1034,7 +1043,7 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), */ -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), @@ -1124,24 +1133,26 @@ def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), // perspective, this is pretty bizarre. Make these instructions disassembly // only for now. -def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "bt{w}\t{$src2, $src1|$src1, $src2}", -// [(X86bt (loadi16 addr:$src1), GR16:$src2), -// (implicit EFLAGS)] - [], IIC_BT_MR - >, OpSize, TB, Requires<[FastBTMem]>; -def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "bt{l}\t{$src2, $src1|$src1, $src2}", -// [(X86bt (loadi32 addr:$src1), GR32:$src2), -// (implicit EFLAGS)] - [], IIC_BT_MR - >, TB, Requires<[FastBTMem]>; -def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "bt{q}\t{$src2, $src1|$src1, $src2}", -// [(X86bt (loadi64 addr:$src1), GR64:$src2), -// (implicit EFLAGS)] - [], IIC_BT_MR - >, TB; +let mayLoad = 1, hasSideEffects = 0 in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi16 addr:$src1), GR16:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize, TB, Requires<[FastBTMem]>; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi32 addr:$src1), GR32:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB, Requires<[FastBTMem]>; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi64 addr:$src1), GR64:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB; +} def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", @@ -1172,7 +1183,7 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), [(set EFLAGS, (X86bt (loadi64 addr:$src1), i64immSExt8:$src2))], IIC_BT_MI>, TB; - +let hasSideEffects = 0 in { def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize, TB; @@ -1180,6 +1191,8 @@ def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1187,6 +1200,8 @@ def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1194,6 +1209,8 @@ def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1201,6 +1218,7 @@ def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, @@ -1209,6 +1227,8 @@ def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1216,6 +1236,8 @@ def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1223,6 +1245,8 @@ def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1230,6 +1254,7 @@ def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, @@ -1238,6 +1263,8 @@ def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1245,6 +1272,8 @@ def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1252,6 +1281,8 @@ def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; + +let mayLoad = 1, mayStore = 1 in { def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1259,6 +1290,8 @@ def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} +} // hasSideEffects = 0 } // Defs = [EFLAGS] @@ -1266,28 +1299,46 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), // Atomic support // - // Atomic swap. These are just normal xchg instructions. But since a memory // operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, + InstrItinClass itin> { + let Constraints = "$val = $dst" in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; + +// Swap between registers. let Constraints = "$val = $dst" in { -def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), - "xchg{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))], - IIC_XCHG_MEM>; -def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr), - "xchg{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))], - IIC_XCHG_MEM>, - OpSize; -def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr), - "xchg{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))], - IIC_XCHG_MEM>; -def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr), - "xchg{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))], - IIC_XCHG_MEM>; - def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), @@ -1298,6 +1349,7 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; } +// Swap between EAX and other registers. def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), @@ -1458,10 +1510,10 @@ def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), Requires<[In32BitMode]>; // Adjust RPL Field of Segment Selector -def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst), +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, Requires<[In32BitMode]>; -def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, Requires<[In32BitMode]>; @@ -1577,26 +1629,26 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, PatFrag ld_frag> { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, EFLAGS, (OpNode RC:$src))]>, T8, VEX_4V; + [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V; def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, EFLAGS, (OpNode (ld_frag addr:$src)))]>, + [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>, T8, VEX_4V; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, - X86blsr_flag, loadi32>; + X86blsr, loadi32>; defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, - X86blsr_flag, loadi64>, VEX_W; + X86blsr, loadi64>, VEX_W; defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, - X86blsmsk_flag, loadi32>; + X86blsmsk, loadi32>; defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, - X86blsmsk_flag, loadi64>, VEX_W; + X86blsmsk, loadi64>, VEX_W; defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, - X86blsi_flag, loadi32>; + X86blsi, loadi32>; defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, - X86blsi_flag, loadi64>, VEX_W; + X86blsi, loadi64>, VEX_W; } multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, @@ -1679,6 +1731,8 @@ include "X86Instr3DNow.td" include "X86InstrVMX.td" include "X86InstrSVM.td" +include "X86InstrTSX.td" + // System instructions. include "X86InstrSystem.td" @@ -1856,6 +1910,8 @@ def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; def : InstAlias<"fxch", (XCH_F ST1)>; +def : InstAlias<"fcom", (COM_FST0r ST1)>; +def : InstAlias<"fcomp", (COMP_FST0r ST1)>; def : InstAlias<"fcomi", (COM_FIr ST1)>; def : InstAlias<"fcompi", (COM_FIPr ST1)>; def : InstAlias<"fucom", (UCOM_Fr ST1)>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index bd54858..127af6f 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -207,8 +207,14 @@ def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>; -def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>; + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>; let neverHasSideEffects = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 17e91a6..3175324 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -203,9 +203,8 @@ multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, string OpcodeStr, X86MemOperand x86memop, list<dag> pat_rr, list<dag> pat_rm, - bit Is2Addr = 1, - bit rr_hasSideEffects = 0> { - let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in + bit Is2Addr = 1> { + let isCommutable = 1, hasSideEffects = 0 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -218,27 +217,6 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, pat_rm, IIC_DEFAULT, d>; } -/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class -multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC, - string asm, string SSEVer, string FPSizeStr, - X86MemOperand x86memop, PatFrag mem_frag, - Domain d, OpndItins itins, bit Is2Addr = 1> { - def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !if(Is2Addr, - !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!cast<Intrinsic>( - !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], IIC_DEFAULT, d>; - def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2), - !if(Is2Addr, - !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!cast<Intrinsic>( - !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>; -} - //===----------------------------------------------------------------------===// // Non-instruction patterns //===----------------------------------------------------------------------===// @@ -481,7 +459,7 @@ def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64, VEX_LIG; // For the disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], @@ -519,7 +497,7 @@ let Constraints = "$src1 = $dst" in { "movsd\t{$src2, $dst|$dst, $src2}">, XD; // For the disassembler - let isCodeGenOnly = 1 in { + let isCodeGenOnly = 1, hasSideEffects = 0 in { def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), "movss\t{$src2, $dst|$dst, $src2}", [], @@ -813,16 +791,16 @@ defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - TB, VEX; + TB, VEX, VEX_L; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - TB, OpSize, VEX; + TB, OpSize, VEX, VEX_L; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - TB, VEX; + TB, VEX, VEX_L; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, - TB, OpSize, VEX; + TB, OpSize, VEX, VEX_L; defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, TB; @@ -855,22 +833,22 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX; + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX; + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movups\t{$src, $dst|$dst, $src}", [(store (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX; + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX; + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; // For disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], @@ -890,19 +868,19 @@ let isCodeGenOnly = 1 in { def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } let Predicates = [HasAVX] in { @@ -944,7 +922,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), IIC_SSE_MOVU_P_MR>; // For disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1302,7 +1280,7 @@ let Predicates = [HasAVX] in { (VMOVHPSrm VR128:$src1, addr:$src2)>; // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the load fold + // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time // and the fold opportunity reappears. def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1322,7 +1300,7 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the load fold + // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time // and the fold opportunity reappears. def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1457,7 +1435,7 @@ defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si{q}\t{$src, $dst|$dst, $src}", + "cvttss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, @@ -1465,26 +1443,43 @@ defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", + "cvttsd2si\t{$src, $dst|$dst, $src}", SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>; +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; + // The assembler can recognize rr 64-bit instructions by seeing a rxx // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly // where appropriate to do so. -defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, XS, VEX_4V, VEX_LIG; defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS, VEX_4V, VEX_W, VEX_LIG; -defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, VEX_4V, VEX_LIG; defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W, VEX_LIG; -def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDrr FR64:$dst, FR64:$src1, GR32:$src)>; -def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", +def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; +def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; let Predicates = [HasAVX] in { @@ -1511,27 +1506,49 @@ defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, XS; defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si{q}\t{$src, $dst|$dst, $src}", + "cvttss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_64>, XS, REX_W; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, "cvttsd2si\t{$src, $dst|$dst, $src}", SSE_CVT_SD2SI>, XD; defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", + "cvttsd2si\t{$src, $dst|$dst, $src}", SSE_CVT_SD2SI>, XD, REX_W; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, - "cvtsi2ss\t{$src, $dst|$dst, $src}", + "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", SSE_CVT_Scalar>, XS; defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", SSE_CVT_Scalar>, XS, REX_W; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, - "cvtsi2sd\t{$src, $dst|$dst, $src}", + "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", SSE_CVT_Scalar>, XD; defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", SSE_CVT_Scalar>, XD, REX_W; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm FR64:$dst, i32mem:$src)>; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm FR64:$dst, i32mem:$src)>; + // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). @@ -1566,27 +1583,27 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, } defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, - int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si{l}", + int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si{q}", + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - sdmem, sse_load_f64, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD; + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, - sdmem, sse_load_f64, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W; + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, VEX_4V; defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", SSE_CVT_Scalar, 0>, XS, VEX_4V, VEX_W; defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", SSE_CVT_Scalar, 0>, XD, VEX_4V; defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", @@ -1596,13 +1613,13 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, - "cvtsi2ss", SSE_CVT_Scalar>, XS; + "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse2_cvtsi2sd, i32mem, loadi32, - "cvtsi2sd", SSE_CVT_Scalar>, XD; + "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; @@ -1616,40 +1633,40 @@ defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, SSE_CVT_SS2SI_32>, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si{q}", SSE_CVT_SS2SI_64>, + "cvttss2si", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W; defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD, VEX; defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si{q}", SSE_CVT_SD2SI>, + "cvttsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_W; defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, ssmem, sse_load_f32, "cvttss2si", SSE_CVT_SS2SI_32>, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si{q}", SSE_CVT_SS2SI_64>, XS, REX_W; + "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD; defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W; + "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - ssmem, sse_load_f32, "cvtss2si{l}", + ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, - ssmem, sse_load_f32, "cvtss2si{q}", + ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - ssmem, sse_load_f32, "cvtss2si{l}", + ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_32>, XS; defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, - ssmem, sse_load_f32, "cvtss2si{q}", + ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_64>, XS, REX_W; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, @@ -1659,13 +1676,47 @@ defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, VEX, Requires<[HasAVX]>; + TB, VEX, VEX_L, Requires<[HasAVX]>; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, TB, Requires<[UseSSE2]>; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; + +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + /// SSE 2 Only // Convert scalar double to scalar single @@ -1814,12 +1865,12 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 VR256:$src))], - IIC_SSE_CVT_PS_RR>, VEX; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; + IIC_SSE_CVT_PS_RM>, VEX, VEX_L; def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], @@ -1853,7 +1904,7 @@ def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX; + (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L; def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1889,12 +1940,12 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], - IIC_SSE_CVT_PS_RR>, VEX; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; + IIC_SSE_CVT_PS_RM>, VEX, VEX_L; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", @@ -1974,7 +2025,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], - IIC_SSE_CVT_PD_RR>, VEX; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2015,12 +2066,12 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, VEX; + IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, VEX; + IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L; } let Predicates = [UseSSE2] in { @@ -2048,11 +2099,11 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtdq2_pd_256 - (bitconvert (memopv2i64 addr:$src))))]>, VEX; + (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX; + (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L; } let neverHasSideEffects = 1, mayLoad = 1 in @@ -2095,7 +2146,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvt_pd2_ps_256 VR256:$src))], - IIC_SSE_CVT_PD_RR>, VEX; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2125,6 +2176,10 @@ let Predicates = [HasAVX] in { (VCVTDQ2PSYrm addr:$src)>; // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (VCVTPD2PSXrm addr:$src)>; def : Pat<(v4f32 (fround (v4f64 VR256:$src))), (VCVTPD2PSYrr VR256:$src)>; def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), @@ -2139,7 +2194,12 @@ let Predicates = [HasAVX] in { } let Predicates = [UseSSE2] in { - // Match fextend for 128 conversions + // Match fround and fextend for 128 conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (CVTPD2PSrm addr:$src)>; + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), (CVTPS2PDrr VR128:$src)>; } @@ -2150,7 +2210,7 @@ let Predicates = [UseSSE2] in { // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - Operand CC, SDNode OpNode, ValueType VT, + Operand CC, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, string asm_alt, OpndItins itins> { def rr : SIi8<0xC2, MRMSrcReg, @@ -2296,7 +2356,7 @@ let Defs = [EFLAGS] in { // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, - Operand CC, Intrinsic Int, string asm, + Operand CC, Intrinsic Int, string asm, string asm_alt, Domain d> { def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, @@ -2329,11 +2389,11 @@ defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, TB, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", @@ -2403,13 +2463,13 @@ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, memopv4f32, SSEPackedSingle>, TB, VEX_4V; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, TB, VEX_4V; + memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2503,16 +2563,16 @@ defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, TB, VEX_4V, VEX_L; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; + SSEPackedSingle>, TB, VEX_4V, VEX_L; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; + SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, @@ -2589,10 +2649,11 @@ let Predicates = [HasAVX] in { "movmskpd", SSEPackedDouble>, TB, OpSize, VEX; defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, TB, VEX; + "movmskps", SSEPackedSingle>, TB, + VEX, VEX_L; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, "movmskpd", SSEPackedDouble>, TB, - OpSize, VEX; + OpSize, VEX, VEX_L; def : Pat<(i32 (X86fgetsign FR32:$src)), (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -2613,11 +2674,11 @@ let Predicates = [HasAVX] in { OpSize, VEX; def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedSingle>, TB, VEX; + SSEPackedSingle>, TB, VEX, VEX_L; def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, SSEPackedDouble>, TB, - OpSize, VEX; + OpSize, VEX, VEX_L; } defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", @@ -2647,10 +2708,8 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions /// PDI_binop_rm - Simple SSE2 binary operator. multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, - OpndItins itins, - bit IsCommutable = 0, - bit Is2Addr = 1> { + X86MemOperand x86memop, OpndItins itins, + bit IsCommutable, bit Is2Addr> { let isCommutable = IsCommutable in def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), @@ -2669,40 +2728,30 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, } } // ExeDomain = SSEPackedInt -// These are ordered here for pattern ordering requirements with the fp versions +multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, + ValueType OpVT128, ValueType OpVT256, + OpndItins itins, bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, + VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; -let Predicates = [HasAVX] in { -defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, + memopv2i64, i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, + OpVT256, VR256, memopv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; } -let Constraints = "$src1 = $dst" in { -defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1>; -defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1>; -defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 1>; -defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64, - i128mem, SSE_BIT_ITINS_P, 0>; -} // Constraints = "$src1 = $dst" +// These are ordered here for pattern ordering requirements with the fp versions -let Predicates = [HasAVX2] in { -defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64, - i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64, - i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64, - i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; -defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64, - i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V; -} +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>; +defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, + SSE_BIT_ITINS_P, 0>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2747,6 +2796,20 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in /// multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, + [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (bc_v4i64 (v4f64 VR256:$src2))))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, + TB, OpSize, VEX_4V, VEX_L; + // In AVX no need to add a pattern for 128-bit logical rr ps, because they // are all promoted to v2i64, and the patterns are covered by the int // version. This is needed in SSE only, because v2i64 isn't supported on @@ -2754,7 +2817,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (memopv2i64 addr:$src2)))], 0, 1>, TB, VEX_4V; + (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, @@ -2763,6 +2826,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (memopv2i64 addr:$src2)))], 0>, TB, OpSize, VEX_4V; + let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, @@ -2779,31 +2843,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, } } -/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms -/// -multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr, - SDNode OpNode> { - defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, - !strconcat(OpcodeStr, "ps"), f256mem, - [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], - [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V; - - defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, - !strconcat(OpcodeStr, "pd"), f256mem, - [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), - (bc_v4i64 (v4f64 VR256:$src2))))], - [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), - (memopv4i64 addr:$src2)))], 0>, - TB, OpSize, VEX_4V; -} - -// AVX 256-bit packed logical ops forms -defm VAND : sse12_fp_packed_logical_y<0x54, "and", and>; -defm VOR : sse12_fp_packed_logical_y<0x56, "or", or>; -defm VXOR : sse12_fp_packed_logical_y<0x57, "xor", xor>; -defm VANDN : sse12_fp_packed_logical_y<0x55, "andn", X86andnp>; - defm AND : sse12_fp_packed_logical<0x54, "and", and>; defm OR : sse12_fp_packed_logical<0x56, "or", or>; defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; @@ -2838,26 +2877,32 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, itins.d, Is2Addr>, XD; } -multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - SizeItins itins, - bit Is2Addr = 1> { +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, SizeItins itins> { +let Predicates = [HasAVX] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, memopv4f32, + SSEPackedSingle, itins.s, 0>, TB, VEX_4V; + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, memopv2f64, + SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, VR256, v8f32, f256mem, memopv8f32, + SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L; + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, VR256, v4f64, f256mem, memopv4f64, + SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, - v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>, - TB; + v4f32, f128mem, memopv4f32, SSEPackedSingle, + itins.s, 1>, TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, - v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>, - TB, OpSize; + v2f64, f128mem, memopv2f64, SSEPackedDouble, + itins.d, 1>, TB, OpSize; } - -multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, - SDNode OpNode, - SizeItins itins> { - defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, - v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>, - TB; - defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, - v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>, - TB, OpSize; } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, @@ -2871,116 +2916,69 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, itins.d, Is2Addr>, XD; } -multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr, - SizeItins itins, - bit Is2Addr = 1> { - defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32, - SSEPackedSingle, itins.s, Is2Addr>, - TB; - - defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64, - SSEPackedDouble, itins.d, Is2Addr>, - TB, OpSize; +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>; + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>; } -multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr, - SizeItins itins> { - defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256, - !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32, - SSEPackedSingle, itins.s, 0>, TB; - - defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256, - !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64, - SSEPackedDouble, itins.d, 0>, TB, OpSize; +let isCodeGenOnly = 1 in { + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>; } -// Binary Arithmetic instructions defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; -defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd, SSE_ALU_ITINS_P>, - VEX_4V; defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>, VEX_4V, VEX_LIG; -defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul, SSE_MUL_ITINS_P>, - VEX_4V; let isCommutable = 0 in { defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, - VEX_4V; defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, - VEX_4V; defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, - basic_sse12_fp_binop_p_y_int<0x5F, "max", SSE_ALU_ITINS_P>, - VEX_4V; defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>, basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min", SSE_ALU_ITINS_P>, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, - VEX_4V; } let Constraints = "$src1 = $dst" in { defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, - basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; let isCommutable = 0 in { defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, - basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P>; + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, - basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P>; + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; } } let isCodeGenOnly = 1 in { defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, VEX_4V; defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; - defm VMINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P, 0>, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, VEX_4V; let Constraints = "$src1 = $dst" in { - defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>; - defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>; + defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; + defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; } } @@ -3011,6 +3009,26 @@ def SSE_RCPS : OpndItins< /// sse1_fp_unop_s - SSE1 unops in scalar form. multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, Intrinsic F32Int, OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), + (ins FR32:$src1, FR32:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + let mayLoad = 1 in { + def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), + (ins FR32:$src1,f32mem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + } +} + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode FR32:$src))]>; @@ -3030,49 +3048,115 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>; } -/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form. -multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand. +multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), + (ins FR32:$src1, FR32:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; let mayLoad = 1 in { - def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), + (ins FR32:$src1,f32mem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + } +} + + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]>; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, + Requires<[UseSSE1, OptForSize]>; + let Constraints = "$src1 = $dst" in { + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), + [], itins.rr>; + let mayLoad = 1, hasSideEffects = 0 in + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, ssmem:$src2), + !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), + [], itins.rm>; } } /// sse1_fp_unop_p - SSE1 unops in packed form. multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], + itins.rr>, VEX; + def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], + itins.rm>, VEX; + def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L; + def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L; +} + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>; + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>; def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>; } -/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form. -multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { - def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], - itins.rr>; - def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], - itins.rm>; -} - /// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, - Intrinsic V4F32Int, OpndItins itins> { + Intrinsic V4F32Int, Intrinsic V8F32Int, + OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))], + itins.rr>, VEX; + def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], + itins.rm>, VEX; + def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V8F32Int VR256:$src))], + itins.rr>, VEX, VEX_L; + def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), + (ins f256mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L; +} + def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (V4F32Int VR128:$src))], @@ -3083,22 +3167,29 @@ multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, itins.rm>; } -/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms. -multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, - Intrinsic V4F32Int, OpndItins itins> { - def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V4F32Int VR256:$src))], - itins.rr>; - def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))], - itins.rm>; -} - /// sse2_fp_unop_s - SSE2 unops in scalar form. multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, Intrinsic F64Int, OpndItins itins> { +let Predicates = [HasAVX], hasSideEffects = 0 in { + def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), + (ins FR64:$src1, FR64:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + let mayLoad = 1 in { + def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), + (ins FR64:$src1,f64mem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, sdmem:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX_4V, VEX_LIG; + } +} + def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>; @@ -3115,26 +3206,32 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>; } -/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. -let hasSideEffects = 0 in -multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { - def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - let mayLoad = 1 in { - def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - } -} - /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(!strconcat("v", OpcodeStr), + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], + itins.rr>, VEX; + def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], + itins.rm>, VEX; + def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(!strconcat("v", OpcodeStr), + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L; + def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(!strconcat("v", OpcodeStr), + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))], + itins.rm>, VEX, VEX_L; +} + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>; @@ -3143,82 +3240,24 @@ multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>; } -/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms. -multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { - def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], - itins.rr>; - def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))], - itins.rm>; -} - -/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms. -multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr, - Intrinsic V2F64Int, OpndItins itins> { - def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V2F64Int VR128:$src))], - itins.rr>; - def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))], - itins.rm>; -} - -/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms. -multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, - Intrinsic V2F64Int, OpndItins itins> { - def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V2F64Int VR256:$src))], - itins.rr>; - def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))], - itins.rm>; -} +// Square root. +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, + SSE_SQRTS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, + SSE_SQRTS>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>; -let Predicates = [HasAVX] in { - // Square root. - defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">, - sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG; - - defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>, - sse2_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>, - sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>, - sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>, - sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps, - SSE_SQRTP>, - sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd, - SSE_SQRTP>, - sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256, - SSE_SQRTP>, - sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256, - SSE_SQRTP>, - VEX; - - // Reciprocal approximations. Note that these typically require refinement - // in order to obtain suitable precision. - defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG; - defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>, - sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>, - sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256, - SSE_SQRTP>, - sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps, - SSE_SQRTP>, VEX; - - defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG; - defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp, SSE_RCPP>, - sse1_fp_unop_p_y<0x53, "vrcp", X86frcp, SSE_RCPP>, - sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256, - SSE_RCPP>, - sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps, - SSE_RCPP>, VEX; -} +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, + int_x86_avx_rsqrt_ps_256, SSE_SQRTP>; +defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, + int_x86_avx_rcp_ps_256, SSE_RCPP>; def : Pat<(f32 (fsqrt FR32:$src)), (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; @@ -3273,27 +3312,14 @@ let Predicates = [HasAVX] in { (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; } -// Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, - SSE_SQRTS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>, - sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps, SSE_SQRTS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTS>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>, - sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>; - // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss, - SSE_SQRTS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, - sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - SSE_SQRTS>; -defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss, - SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>; +let Predicates = [UseSSE1] in { + def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), + (RSQRTSSr_Int VR128:$src, VR128:$src)>; + def : Pat<(int_x86_sse_rcp_ss VR128:$src), + (RCPSSr_Int VR128:$src, VR128:$src)>; +} // There is no f64 version of the reciprocal approximation instructions. @@ -3331,20 +3357,20 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions "movntps\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_L; def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_L; let ExeDomain = SSEPackedInt in def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_L; } let AddedComplexity = 400 in { // Prefer non-temporal versions @@ -3453,53 +3479,52 @@ def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), VEX; def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX; -} + VEX, VEX_L; def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, VEX; def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX; + VEX, VEX_L; +} // For Disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, VEX; def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, - VEX; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, VEX; def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, - VEX; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } -let canFoldAsLoad = 1, mayLoad = 1 in { +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + neverHasSideEffects = 1 in { def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, VEX; def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, - VEX; + VEX, VEX_L; let Predicates = [HasAVX] in { def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, XS, VEX; def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, - XS, VEX; + XS, VEX, VEX_L; } } -let mayStore = 1 in { +let mayStore = 1, neverHasSideEffects = 1 in { def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, @@ -3507,14 +3532,14 @@ def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, - VEX; + VEX, VEX_L; let Predicates = [HasAVX] in { def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, XS, VEX; def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, - XS, VEX; + XS, VEX, VEX_L; } } @@ -3527,7 +3552,7 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; // For Disassembler -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -3537,7 +3562,8 @@ def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; } -let canFoldAsLoad = 1, mayLoad = 1 in { +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + neverHasSideEffects = 1 in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], @@ -3561,25 +3587,17 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), XS, Requires<[UseSSE2]>; } -// Intrinsic forms of MOVDQU load and store -def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], - IIC_SSE_MOVU_P_MR>, - XS, VEX, Requires<[HasAVX]>; - -def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], - IIC_SSE_MOVU_P_MR>, - XS, Requires<[UseSSE2]>; - } // ExeDomain = SSEPackedInt let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), (VMOVDQUYmr addr:$dst, VR256:$src)>; } +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (MOVDQUmr addr:$dst, VR128:$src)>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Arithmetic Instructions @@ -3613,6 +3631,24 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, itins.rm>; } +multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + Intrinsic IntId256, OpndItins itins, + bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, + VR128, memopv2i64, i128mem, itins, + IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, + i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, + VR256, memopv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, RegisterClass RC, @@ -3642,7 +3678,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>; } -/// PDI_binop_rm - Simple SSE2 binary operator with different src and dst types +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType DstVT, ValueType SrcVT, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, @@ -3665,249 +3701,75 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } } // ExeDomain = SSEPackedInt -// 128-bit Integer Arithmetic +defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1>; +defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 1>; +defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1>; +defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0>; +defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 0>; +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; -let Predicates = [HasAVX] in { -defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1, 0 /*3addr*/>, - VEX_4V; -defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDQ : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64, - i128mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V; -defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64, - i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, - i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +// Intrinsic forms +defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, + int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>; +defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, + int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>; +defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b, + int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>; +defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w, + int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>; +defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, + int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; +defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, + int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; +defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, + int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>; +defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, + int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>; +defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, + int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; +defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, + int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; +defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, + int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; +defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, + int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>; + +let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; - -// Intrinsic forms -defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, - VR128, memopv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, - VR128, memopv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, - VR128, memopv2i64, i128mem, - SSE_PMADD, 1, 0>, VEX_4V; -defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -} - -let Predicates = [HasAVX2] in { -defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64, - i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V; -defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64, - i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, - i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, - i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +let Predicates = [HasAVX2] in defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, VR256, memopv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; - -// Intrinsic forms -defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w, - VR256, memopv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w, - VR256, memopv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; -defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd, - VR256, memopv4i64, i256mem, - SSE_PMADD, 1, 0>, VEX_4V; -defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; -} - -let Constraints = "$src1 = $dst" in { -defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1>; -defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1>; -defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P, 1>; -defm PADDQ : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64, - i128mem, SSE_INTALUQ_ITINS_P, 1>; -defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64, - i128mem, SSE_INTMUL_ITINS_P, 1>; -defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P>; -defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P>; -defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, - i128mem, SSE_INTALU_ITINS_P>; -defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, - i128mem, SSE_INTALUQ_ITINS_P>; + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; -// Intrinsic forms -defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, - VR128, memopv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1>; -defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, - VR128, memopv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1>; -defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, - VR128, memopv2i64, i128mem, - SSE_PMADD, 1>; -defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; - -} // Constraints = "$src1 = $dst" - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// @@ -3961,30 +3823,30 @@ let ExeDomain = SSEPackedInt in { let Predicates = [HasAVX2] in { defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, VR256, v16i16, v8i16, bc_v8i16, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR256, v8i32, v4i32, bc_v4i32, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR256, v4i64, v2i64, bc_v2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, VR256, v16i16, v8i16, bc_v8i16, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR256, v8i32, v4i32, bc_v4i32, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR256, v4i64, v2i64, bc_v2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, VR256, v16i16, v8i16, bc_v8i16, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; let ExeDomain = SSEPackedInt in { // 256-bit logical shifts. @@ -3993,13 +3855,13 @@ let ExeDomain = SSEPackedInt in { "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>, - VEX_4V; + VEX_4V, VEX_L; def VPSRLDQYri : PDIi8<0x73, MRM3r, (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>, - VEX_4V; + VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. } } // Predicates = [HasAVX2] @@ -4089,183 +3951,106 @@ let Predicates = [UseSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let Predicates = [HasAVX] in { - defm VPCMPEQB : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v16i8, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPEQW : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v8i16, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPEQD : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v4i32, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPGTB : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v16i8, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; - defm VPCMPGTW : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v8i16, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; - defm VPCMPGTD : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v4i32, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -} - -let Predicates = [HasAVX2] in { - defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; - defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; - defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; - defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -} - -let Constraints = "$src1 = $dst" in { - defm PCMPEQB : PDI_binop_rm<0x74, "pcmpeqb", X86pcmpeq, v16i8, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; - defm PCMPEQW : PDI_binop_rm<0x75, "pcmpeqw", X86pcmpeq, v8i16, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; - defm PCMPEQD : PDI_binop_rm<0x76, "pcmpeqd", X86pcmpeq, v4i32, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 1>; - defm PCMPGTB : PDI_binop_rm<0x64, "pcmpgtb", X86pcmpgt, v16i8, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; - defm PCMPGTW : PDI_binop_rm<0x65, "pcmpgtw", X86pcmpgt, v8i16, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; - defm PCMPGTD : PDI_binop_rm<0x66, "pcmpgtd", X86pcmpgt, v4i32, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -} // Constraints = "$src1 = $dst" +defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1>; +defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1>; +defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1>; +defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0>; +defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0>; +defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let Predicates = [HasAVX] in { -defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -} - -let Predicates = [HasAVX2] in { -defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -} - -let Constraints = "$src1 = $dst" in { -defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -} // Constraints = "$src1 = $dst" +defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, + int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>; +defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, + int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>; +defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, + int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// let ExeDomain = SSEPackedInt in { -multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, SDNode OpNode> { -def ri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (vt (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF>; -def mi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, - (vt (OpNode (bitconvert (memopv2i64 addr:$src1)), - (i8 imm:$src2))))], - IIC_SSE_PSHUF>; -} - -multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> { -def Yri : Ii8<0x70, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>; -def Ymi : Ii8<0x70, MRMSrcMem, - (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (vt (OpNode (bitconvert (memopv4i64 addr:$src1)), - (i8 imm:$src2))))]>; -} -} // ExeDomain = SSEPackedInt - +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, + SDNode OpNode> { let Predicates = [HasAVX] in { - let AddedComplexity = 5 in - defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, X86PShufd>, TB, OpSize, VEX; - - // SSE2 with ImmT == Imm8 and XS prefix. - defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, X86PShufhw>, XS, VEX; - - // SSE2 with ImmT == Imm8 and XD prefix. - defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, X86PShuflw>, XD, VEX; - - def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), - (VPSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>; + def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>, VEX; + def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX; } let Predicates = [HasAVX2] in { - defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX; - defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX; - defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX; + def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, i8imm:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>, VEX, VEX_L; + def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, i8imm:$src2), + !strconcat(!strconcat("v", OpcodeStr), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L; } let Predicates = [UseSSE2] in { - let AddedComplexity = 5 in - defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize; + def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>; + def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF>; +} +} +} // ExeDomain = SSEPackedInt - // SSE2 with ImmT == Imm8 and XS prefix. - defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, X86PShufhw>, XS; +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; - // SSE2 with ImmT == Imm8 and XD prefix. - defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, X86PShuflw>, XD; +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; +} - def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), - (PSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (PSHUFDri VR128:$src1, imm:$imm)>; +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -4328,22 +4113,22 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, - bc_v32i8>, VEX_4V; + bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, - bc_v16i16>, VEX_4V; + bc_v16i16>, VEX_4V, VEX_L; defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, - bc_v8i32>, VEX_4V; + bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, - bc_v4i64>, VEX_4V; + bc_v4i64>, VEX_4V, VEX_L; defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, - bc_v32i8>, VEX_4V; + bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, - bc_v16i16>, VEX_4V; + bc_v16i16>, VEX_4V, VEX_L; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, - bc_v8i32>, VEX_4V; + bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, - bc_v4i64>, VEX_4V; + bc_v4i64>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -4435,9 +4220,9 @@ def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), let Predicates = [HasAVX2] in { def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX; + [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L; def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; + "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; } def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), @@ -4900,9 +4685,9 @@ let Predicates = [HasAVX] in { defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", v4f32, VR128, memopv4f32, f128mem>, VEX; defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v8f32, VR256, memopv8f32, f256mem>, VEX; + v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L; defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v8f32, VR256, memopv8f32, f256mem>, VEX; + v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L; } defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, memopv4f32, f128mem>; @@ -4970,7 +4755,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), let Predicates = [HasAVX] in { defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; - defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; @@ -5019,7 +4804,8 @@ let Predicates = [HasAVX] in { [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", - [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX; + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, + VEX, VEX_L; } def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "lddqu\t{$src, $dst|$dst, $src}", @@ -5052,13 +4838,13 @@ let Predicates = [HasAVX] in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; + f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; + f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { @@ -5113,9 +4899,9 @@ let Predicates = [HasAVX] in { defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, X86fhsub, 0>, VEX_4V; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, 0>, VEX_4V, VEX_L; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, 0>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, @@ -5123,9 +4909,9 @@ let Predicates = [HasAVX] in { defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, X86fhsub, 0>, VEX_4V; defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, 0>, VEX_4V, VEX_L; defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, 0>, VEX_4V, VEX_L; } } @@ -5191,11 +4977,11 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", - int_x86_avx2_pabs_b>, VEX; + int_x86_avx2_pabs_b>, VEX, VEX_L; defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", - int_x86_avx2_pabs_w>, VEX; + int_x86_avx2_pabs_w>, VEX, VEX_L; defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", - int_x86_avx2_pabs_d>, VEX; + int_x86_avx2_pabs_d>, VEX, VEX_L; } defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", @@ -5334,37 +5120,37 @@ let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, memopv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", - int_x86_avx2_phadd_sw>, VEX_4V; + int_x86_avx2_phadd_sw>, VEX_4V, VEX_L; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", - int_x86_avx2_phsub_sw>, VEX_4V; + int_x86_avx2_phsub_sw>, VEX_4V, VEX_L; defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", - int_x86_avx2_pmadd_ub_sw>, VEX_4V; + int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L; } defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", - int_x86_avx2_pmul_hr_sw>, VEX_4V; + int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L; } // None of these have i8 immediate fields. @@ -5443,7 +5229,7 @@ multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> { let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Predicates = [HasAVX2] in - defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V; + defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palign<"palignr">; @@ -5550,17 +5336,17 @@ defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, let Predicates = [HasAVX2] in { defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", - int_x86_avx2_pmovsxbw>, VEX; + int_x86_avx2_pmovsxbw>, VEX, VEX_L; defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", - int_x86_avx2_pmovsxwd>, VEX; + int_x86_avx2_pmovsxwd>, VEX, VEX_L; defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", - int_x86_avx2_pmovsxdq>, VEX; + int_x86_avx2_pmovsxdq>, VEX, VEX_L; defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", - int_x86_avx2_pmovzxbw>, VEX; + int_x86_avx2_pmovzxbw>, VEX, VEX_L; defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", - int_x86_avx2_pmovzxwd>, VEX; + int_x86_avx2_pmovzxwd>, VEX, VEX_L; defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", - int_x86_avx2_pmovzxdq>, VEX; + int_x86_avx2_pmovzxdq>, VEX, VEX_L; } defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; @@ -5576,31 +5362,43 @@ let Predicates = [HasAVX] in { (VPMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), (VPMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), + (VPMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), (VPMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), (VPMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), + (VPMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), (VPMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), (VPMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), + (VPMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), (VPMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), (VPMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), + (VPMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), (VPMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), (VPMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), + (VPMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), (VPMOVZXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), (VPMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), + (VPMOVZXDQrm addr:$src)>; } let Predicates = [UseSSE41] in { @@ -5609,31 +5407,43 @@ let Predicates = [UseSSE41] in { (PMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), (PMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), + (PMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), (PMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), (PMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), + (PMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), (PMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), (PMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), + (PMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), (PMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), (PMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), + (PMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), (PMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), (PMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), + (PMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), (PMOVZXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), (PMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), + (PMOVZXDQrm addr:$src)>; } let Predicates = [HasAVX2] in { @@ -5697,13 +5507,13 @@ defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, let Predicates = [HasAVX2] in { defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", - int_x86_avx2_pmovsxbd>, VEX; + int_x86_avx2_pmovsxbd>, VEX, VEX_L; defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", - int_x86_avx2_pmovsxwq>, VEX; + int_x86_avx2_pmovsxwq>, VEX, VEX_L; defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", - int_x86_avx2_pmovzxbd>, VEX; + int_x86_avx2_pmovzxbd>, VEX, VEX_L; defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", - int_x86_avx2_pmovzxwq>, VEX; + int_x86_avx2_pmovzxwq>, VEX, VEX_L; } defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; @@ -5772,13 +5582,38 @@ defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, } let Predicates = [HasAVX2] in { defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", - int_x86_avx2_pmovsxbq>, VEX; + int_x86_avx2_pmovsxbq>, VEX, VEX_L; defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", - int_x86_avx2_pmovzxbq>, VEX; + int_x86_avx2_pmovzxbq>, VEX, VEX_L; } defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; +let Predicates = [HasAVX2] in { + def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))), + (VPMOVSXWDYrm addr:$src)>; + def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))), + (VPMOVSXDQYrm addr:$src)>; + + def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXBDYrm addr:$src)>; + def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXBDYrm addr:$src)>; + + def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXWQYrm addr:$src)>; + def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXWQYrm addr:$src)>; + + def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBQYrm addr:$src)>; +} + let Predicates = [HasAVX] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq @@ -5803,6 +5638,157 @@ let Predicates = [UseSSE41] in { (bitconvert (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), (PMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXWQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (extloadi32i16 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXDQrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVSXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVSXBWrm addr:$src)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; + + def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>; + + def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; + + def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))), + (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))), + (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))), + (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))), + (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), + (VPMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXWQrm addr:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), + (VPMOVZXDQrm addr:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVSXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVSXBWrm addr:$src)>; + + def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 + (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXWQrm addr:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 + (scalar_to_vector (extloadi32i16 addr:$src))))))), + (VPMOVSXBQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), + (PMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXWQrm addr:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), + (PMOVZXDQrm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -6108,6 +6094,7 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, Intrinsic F64Int, bit Is2Addr = 1> { let ExeDomain = GenericDomain in { // Operation, reg. + let hasSideEffects = 0 in def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6141,6 +6128,7 @@ let ExeDomain = GenericDomain in { OpSize; // Operation, reg. + let hasSideEffects = 0 in def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6185,7 +6173,7 @@ let Predicates = [HasAVX] in { defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, memopv8f32, memopv4f64, int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX; + int_x86_avx_round_pd_256>, VEX, VEX_L; defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; @@ -6213,12 +6201,47 @@ let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x2))>; + def : Pat<(v4f32 (frint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x3))>; + def : Pat<(v2f64 (ffloor VR128:$src)), (VROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x2))>; + def : Pat<(v2f64 (frint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x3))>; + def : Pat<(v8f32 (ffloor VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0x1))>; + def : Pat<(v8f32 (fnearbyint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x2))>; + def : Pat<(v8f32 (frint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x3))>; + def : Pat<(v4f64 (ffloor VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0x1))>; + def : Pat<(v4f64 (fnearbyint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x2))>; + def : Pat<(v4f64 (frint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x3))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6252,8 +6275,25 @@ let Predicates = [UseSSE41] in { def : Pat<(v4f32 (ffloor VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x2))>; + def : Pat<(v4f32 (frint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x3))>; + def : Pat<(v2f64 (ffloor VR128:$src)), (ROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x2))>; + def : Pat<(v2f64 (frint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x3))>; } //===----------------------------------------------------------------------===// @@ -6275,11 +6315,11 @@ def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, - OpSize, VEX; + OpSize, VEX, VEX_L; def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>, - OpSize, VEX; + OpSize, VEX, VEX_L; } let Defs = [EFLAGS] in { @@ -6308,11 +6348,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, let Defs = [EFLAGS], Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; -defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>, + VEX_L; } let ExeDomain = SSEPackedDouble in { defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; -defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>, + VEX_L; } } @@ -6408,67 +6450,6 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } -let Predicates = [HasAVX] in { - let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, - 0>, VEX_4V; - defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb, - 0>, VEX_4V; - defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd, - 0>, VEX_4V; - defm VPMINUD : SS41I_binop_rm_int<0x3B, "vpminud", int_x86_sse41_pminud, - 0>, VEX_4V; - defm VPMINUW : SS41I_binop_rm_int<0x3A, "vpminuw", int_x86_sse41_pminuw, - 0>, VEX_4V; - defm VPMAXSB : SS41I_binop_rm_int<0x3C, "vpmaxsb", int_x86_sse41_pmaxsb, - 0>, VEX_4V; - defm VPMAXSD : SS41I_binop_rm_int<0x3D, "vpmaxsd", int_x86_sse41_pmaxsd, - 0>, VEX_4V; - defm VPMAXUD : SS41I_binop_rm_int<0x3F, "vpmaxud", int_x86_sse41_pmaxud, - 0>, VEX_4V; - defm VPMAXUW : SS41I_binop_rm_int<0x3E, "vpmaxuw", int_x86_sse41_pmaxuw, - 0>, VEX_4V; - defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, - 0>, VEX_4V; -} - -let Predicates = [HasAVX2] in { - let isCommutable = 0 in - defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", - int_x86_avx2_packusdw>, VEX_4V; - defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb", - int_x86_avx2_pmins_b>, VEX_4V; - defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd", - int_x86_avx2_pmins_d>, VEX_4V; - defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud", - int_x86_avx2_pminu_d>, VEX_4V; - defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw", - int_x86_avx2_pminu_w>, VEX_4V; - defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb", - int_x86_avx2_pmaxs_b>, VEX_4V; - defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd", - int_x86_avx2_pmaxs_d>, VEX_4V; - defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud", - int_x86_avx2_pmaxu_d>, VEX_4V; - defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw", - int_x86_avx2_pmaxu_w>, VEX_4V; - defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", - int_x86_avx2_pmul_dq>, VEX_4V; -} - -let Constraints = "$src1 = $dst" in { - let isCommutable = 0 in - defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; - defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>; - defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>; - defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>; - defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw", int_x86_sse41_pminuw>; - defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb", int_x86_sse41_pmaxsb>; - defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd", int_x86_sse41_pmaxsd>; - defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", int_x86_sse41_pmaxud>; - defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw>; - defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; -} /// SS48I_binop_rm - Simple SSE41 binary operator. multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -6492,6 +6473,76 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, } let Predicates = [HasAVX] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, + 0>, VEX_4V; + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, + 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", + int_x86_avx2_packusdw>, VEX_4V, VEX_L; + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", + int_x86_avx2_pmul_dq>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in + defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, + memopv2i64, i128mem>; + defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, + memopv2i64, i128mem>; + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128, + memopv2i64, i128mem>; + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128, + memopv2i64, i128mem>; + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128, + memopv2i64, i128mem>; + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128, + memopv2i64, i128mem>; + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128, + memopv2i64, i128mem>; + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, + memopv2i64, i128mem>; + defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; +} + +let Predicates = [HasAVX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, memopv2i64, i128mem, 0>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, @@ -6499,9 +6550,9 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V; + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - memopv4i64, i256mem, 0>, VEX_4V; + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -6544,13 +6595,15 @@ let Predicates = [HasAVX] in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V; + int_x86_avx_blend_ps_256, VR256, memopv8f32, + f256mem, 0>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, VR128, memopv2f64, f128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V; + int_x86_avx_blend_pd_256,VR256, memopv4f64, + f256mem, 0>, VEX_4V, VEX_L; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, 0>, VEX_4V; @@ -6565,15 +6618,15 @@ let Predicates = [HasAVX] in { VR128, memopv2f64, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, memopv8f32, i256mem, 0>, VEX_4V; + VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, memopv4i64, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, memopv4i64, i256mem, 0>, VEX_4V; + VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L; } } @@ -6624,13 +6677,13 @@ let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, memopv2f64, int_x86_sse41_blendvpd>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, - memopv4f64, int_x86_avx_blendv_pd_256>; + memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, memopv4f32, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, - memopv8f32, int_x86_avx_blendv_ps_256>; + memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, memopv2i64, int_x86_sse41_pblendvb>; @@ -6638,7 +6691,7 @@ defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - memopv4i64, int_x86_avx2_pblendvb>; + memopv4i64, int_x86_avx2_pblendvb>, VEX_L; } let Predicates = [HasAVX] in { @@ -6670,31 +6723,31 @@ let Predicates = [HasAVX] in { (v4f64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2), + def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2), (imm:$mask))), - (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>; - def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2), + (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; + def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), (imm:$mask))), - (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>; + (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; - def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2), + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), (imm:$mask))), - (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), (imm:$mask))), - (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), (imm:$mask))), - (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>; + (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; } let Predicates = [HasAVX2] in { def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), (v32i8 VR256:$src2))), - (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), (imm:$mask))), - (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>; + (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } /// SS41I_ternary_int - SSE 4.1 ternary operator @@ -6758,15 +6811,15 @@ let Predicates = [UseSSE41] in { (v2f64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2), + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), (imm:$mask))), - (PBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), (imm:$mask))), - (BLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), (imm:$mask))), - (BLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>; + (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; } @@ -6779,7 +6832,7 @@ let Predicates = [HasAVX2] in def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, - OpSize, VEX; + OpSize, VEX, VEX_L; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, @@ -6815,7 +6868,7 @@ let Predicates = [HasAVX] in let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - memopv4i64, i256mem, 0>, VEX_4V; + memopv4i64, i256mem, 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, @@ -6833,8 +6886,8 @@ multiclass pseudo_pcmpistrm<string asm> { imm:$src3))]>; def MEM : PseudoI<(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 - VR128:$src1, (load addr:$src2), imm:$src3))]>; + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; } let Defs = [EFLAGS], usesCustomInserter = 1 in { @@ -6842,24 +6895,22 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in { defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; } -let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in { - def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; +multiclass pcmpistrm_SS42AI<string asm> { + def rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; let mayLoad = 1 in - def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; + def rm :SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; } let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { - def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; - let mayLoad = 1 in - def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; + let Predicates = [HasAVX] in + defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; } // Packed Compare Explicit Length Strings, Return Mask @@ -6870,8 +6921,8 @@ multiclass pseudo_pcmpestrm<string asm> { VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; def MEM : PseudoI<(outs VR128:$dst), (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>; + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, + (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>; } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { @@ -6879,64 +6930,94 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; } -let Predicates = [HasAVX], - Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { - def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; +multiclass SS42AI_pcmpestrm<string asm> { + def rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; let mayLoad = 1 in - def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; + def rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; } let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { - def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; - let mayLoad = 1 in - def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; + let Predicates = [HasAVX] in + defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; } // Packed Compare Implicit Length Strings, Return Index -let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { - multiclass SS42AI_pcmpistri<string asm> { - def rr : SS42AI<0x63, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; - let mayLoad = 1 in - def rm : SS42AI<0x63, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), - []>, OpSize; - } +multiclass pseudo_pcmpistri<string asm> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + [(set GR32:$dst, EFLAGS, + (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; } -let Predicates = [HasAVX] in -defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; -defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpistri<string asm> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; + let mayLoad = 1 in + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, OpSize; +} + +let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; + defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +} // Packed Compare Explicit Length Strings, Return Index -let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { - multiclass SS42AI_pcmpestri<string asm> { - def rr : SS42AI<0x61, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; - let mayLoad = 1 in - def rm : SS42AI<0x61, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), - []>, OpSize; - } +multiclass pseudo_pcmpestri<string asm> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX, + imm:$src5))]>; } -let Predicates = [HasAVX] in -defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; -defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestri<string asm> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; + let mayLoad = 1 in + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, OpSize; +} + +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { + let Predicates = [HasAVX] in + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; + defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +} //===----------------------------------------------------------------------===// // SSE4.2 - CRC Instructions @@ -7227,27 +7308,27 @@ let ExeDomain = SSEPackedSingle in { def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, int_x86_avx_vbroadcast_ss>; def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256>; + int_x86_avx_vbroadcast_ss_256>, VEX_L; } let ExeDomain = SSEPackedDouble in def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256>; + int_x86_avx_vbroadcast_sd_256>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256>; + int_x86_avx_vbroadcastf128_pd_256>, VEX_L; let ExeDomain = SSEPackedSingle in { def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, int_x86_avx2_vbroadcast_ss_ps>; def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256>; + int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L; } let ExeDomain = SSEPackedDouble in def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>; + int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, - int_x86_avx2_vbroadcasti128>; + int_x86_avx2_vbroadcasti128>, VEX_L; let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), @@ -7261,12 +7342,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + []>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f128mem:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + []>, VEX_4V, VEX_L; } let Predicates = [HasAVX] in { @@ -7335,12 +7416,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), (ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX; + []>, VEX, VEX_L; let mayStore = 1 in def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), (ins f128mem:$dst, VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, VEX; + []>, VEX, VEX_L; } // AVX1 patterns @@ -7415,7 +7496,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, (ins VR256:$src1, f256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V; + VEX_4V, VEX_L; def mr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -7423,7 +7504,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedSingle in @@ -7471,13 +7552,13 @@ let ExeDomain = SSEPackedSingle in { defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>; + memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>; defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>; + memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } let Predicates = [HasAVX] in { @@ -7505,12 +7586,12 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V; + (i8 imm:$src3))))]>, VEX_4V, VEX_L; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V; + (i8 imm:$src3)))]>, VEX_4V, VEX_L; } let Predicates = [HasAVX] in { @@ -7587,9 +7668,9 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { let Predicates = [HasAVX, HasF16C] in { defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; - defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; - defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; } //===----------------------------------------------------------------------===// @@ -7621,9 +7702,16 @@ let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, VR128, memopv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, memopv4i64, i256mem>; + VR256, memopv4i64, i256mem>, VEX_L; } +def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), + imm:$mask)), + (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; +def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), + imm:$mask)), + (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; + //===----------------------------------------------------------------------===// // VPBROADCAST - Load from memory and broadcast to all elements of the // destination operand @@ -7640,11 +7728,12 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, VEX; + [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; + (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + VEX, VEX_L; } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, @@ -7779,7 +7868,8 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, VEX_4V; + (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, + VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, @@ -7787,7 +7877,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - VEX_4V; + VEX_4V, VEX_L; } defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>; @@ -7801,14 +7891,15 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, VEX; + (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), - (i8 imm:$src2))))]>, VEX; + (i8 imm:$src2))))]>, VEX, VEX_L; } defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W; @@ -7822,12 +7913,12 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V; + (i8 imm:$src3))))]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V; + (i8 imm:$src3)))]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in { def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), @@ -7856,12 +7947,12 @@ let neverHasSideEffects = 1 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + []>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + []>, VEX_4V, VEX_L; } let Predicates = [HasAVX2] in { @@ -7911,11 +8002,12 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, - VEX; + VEX, VEX_L; let neverHasSideEffects = 1, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), - "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX, VEX_L; let Predicates = [HasAVX2] in { def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), @@ -7966,7 +8058,8 @@ multiclass avx2_pmovmask<string OpcodeStr, def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, VEX_4V; + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; def mr : AVX28I<0x8e, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -7974,7 +8067,7 @@ multiclass avx2_pmovmask<string OpcodeStr, def Ymr : AVX28I<0x8e, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; } defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", @@ -8012,14 +8105,14 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, - VEX_4V; + VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>, - VEX_4V; + VEX_4V, VEX_L; } defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index bdeb63f..1185941 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -51,6 +51,7 @@ def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), // NOTE: We don't include patterns for shifts of a register by one, because // 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). +let hasSideEffects = 0 in { def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), "shl{b}\t$dst", [], IIC_SR>; def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), @@ -59,8 +60,9 @@ def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), "shl{l}\t$dst", [], IIC_SR>; def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t$dst", [], IIC_SR>; +} // hasSideEffects = 0 } // isConvertibleToThreeAddress = 1 -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst" // FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern @@ -333,6 +335,7 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), // Rotate instructions //===----------------------------------------------------------------------===// +let hasSideEffects = 0 in { let Constraints = "$src1 = $dst" in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; @@ -455,6 +458,7 @@ def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; } +} // hasSideEffects = 0 let Constraints = "$src1 = $dst" in { // FIXME: provide shorter instructions when imm8 == 1 @@ -839,6 +843,16 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, } // Defs = [EFLAGS] +def ROT32L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. + return getI8Imm(32 - N->getZExtValue()); +}]>; + +def ROT64L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. + return getI8Imm(64 - N->getZExtValue()); +}]>; + multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let neverHasSideEffects = 1 in { def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), @@ -873,4 +887,72 @@ let Predicates = [HasBMI2] in { defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize; defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W; + + // Prefer RORX which is non-destructive and doesn't update EFLAGS. + let AddedComplexity = 10 in { + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), + (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), + (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; + } + + def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)), + (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)), + (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>; + + // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not + // immedidate shift, i.e. the following code is considered better + // + // mov %edi, %esi + // shl $imm, %esi + // ... %edi, ... + // + // than + // + // movb $imm, %sil + // shlx %sil, %edi, %esi + // ... %edi, ... + // + let AddedComplexity = 1 in { + def : Pat<(sra GR32:$src1, GR8:$src2), + (SARX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra GR64:$src1, GR8:$src2), + (SARX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl GR32:$src1, GR8:$src2), + (SHRX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl GR64:$src1, GR8:$src2), + (SHRX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl GR32:$src1, GR8:$src2), + (SHLX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl GR64:$src1, GR8:$src2), + (SHLX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } + + // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor + // + // mov (%ecx), %esi + // shl $imm, $esi + // + // over + // + // movb $imm %al + // shlx %al, (%ecx), %esi + // + // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole + // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible. } diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td new file mode 100644 index 0000000..ad55058 --- /dev/null +++ b/lib/Target/X86/X86InstrTSX.td @@ -0,0 +1,32 @@ +//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel TSX instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TSX instructions + +let usesCustomInserter = 1 in +def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), + "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, + Requires<[HasRTM]>; + +let isBranch = 1, isTerminator = 1, Defs = [EAX] in +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst), + "xbegin\t$dst", []>; + +def XEND : I<0x01, MRM_D5, (outs), (ins), + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + +def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), + "xabort\t$imm", + [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 8ec2c68..2aa08fa 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -75,10 +75,10 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int VR256:$src))]>, VEX; + [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L; def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L; } let isAsmParserOnly = 1 in { @@ -238,7 +238,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, - VEX_4V, VEX_I8IMM; + VEX_4V, VEX_I8IMM, VEX_L; def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, @@ -246,7 +246,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, (bitconvert (memopv4i64 addr:$src3))))]>, - VEX_4V, VEX_I8IMM, VEX_W, MemOp4; + VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -254,7 +254,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR256:$dst, (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)), VR256:$src3))]>, - VEX_4V, VEX_I8IMM; + VEX_4V, VEX_I8IMM, VEX_L; } let isAsmParserOnly = 1 in { @@ -287,20 +287,21 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>; + (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, - VEX_W, MemOp4; + VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, - (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>; + (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>, + VEX_L; } defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 764aa5d..cca391f 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -16,7 +16,7 @@ #include "X86Relocations.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/Function.h" +#include "llvm/IR/Function.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Valgrind.h" diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h index d7c08df..f916327 100644 --- a/lib/Target/X86/X86JITInfo.h +++ b/lib/Target/X86/X86JITInfo.h @@ -14,8 +14,8 @@ #ifndef X86JITINFO_H #define X86JITINFO_H -#include "llvm/Function.h" #include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/IR/Function.h" #include "llvm/Target/TargetJITInfo.h" namespace llvm { diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 1c2ef25..5a1e1b8 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -12,23 +12,48 @@ // //===----------------------------------------------------------------------===// -#include "X86MCInstLower.h" #include "X86AsmPrinter.h" -#include "X86COFFMachineModuleInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" -#include "llvm/Type.h" +#include "X86COFFMachineModuleInfo.h" +#include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Target/Mangler.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/Target/Mangler.h" using namespace llvm; +namespace { + +/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. +class X86MCInstLower { + MCContext &Ctx; + Mangler *Mang; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; + X86AsmPrinter &AsmPrinter; +public: + X86MCInstLower(Mangler *mang, const MachineFunction &MF, + X86AsmPrinter &asmprinter); + + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +private: + MachineModuleInfoMachO &getMachOMMI() const; +}; + +} // end anonymous namespace + X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, X86AsmPrinter &asmprinter) : Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()), @@ -43,15 +68,11 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference"); + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; - if (!MO.isGlobal()) { - assert(MO.isSymbol()); - Name += MAI.getGlobalPrefix(); - Name += MO.getSymbolName(); - } else { + if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); bool isImplicitlyPrivate = false; if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB || @@ -61,6 +82,11 @@ GetSymbolFromOperand(const MachineOperand &MO) const { isImplicitlyPrivate = true; Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + } else if (MO.isSymbol()) { + Name += MAI.getGlobalPrefix(); + Name += MO.getSymbolName(); + } else if (MO.isMBB()) { + Name += MO.getMBB()->getSymbol()->getName(); } // If the target flags on the operand changes the name of the symbol, do that @@ -191,7 +217,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, if (Expr == 0) Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); @@ -324,9 +350,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::CreateImm(MO.getImm()); break; case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), Ctx)); - break; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); @@ -371,10 +394,6 @@ ReSimplify: case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break; case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break; - case X86::SETB_C8r: LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break; - case X86::SETB_C16r: LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break; - case X86::SETB_C32r: LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break; - case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; @@ -531,18 +550,14 @@ ReSimplify: OutMI.setOpcode(X86::RET); break; - case X86::MORESTACK_RET_RESTORE_R10: { - MCInst retInst; - + case X86::MORESTACK_RET_RESTORE_R10: OutMI.setOpcode(X86::MOV64rr); OutMI.addOperand(MCOperand::CreateReg(X86::R10)); OutMI.addOperand(MCOperand::CreateReg(X86::RAX)); - retInst.setOpcode(X86::RET); - AsmPrinter.OutStreamer.EmitInstruction(retInst); + AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); break; } - } } static void LowerTlsAddr(MCStreamer &OutStreamer, @@ -556,11 +571,8 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCContext &context = OutStreamer.getContext(); - if (needsPadding) { - MCInst prefix; - prefix.setOpcode(X86::DATA16_PREFIX); - OutStreamer.EmitInstruction(prefix); - } + if (needsPadding) + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { @@ -610,20 +622,11 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, OutStreamer.EmitInstruction(LEA); if (needsPadding) { - MCInst prefix; - prefix.setOpcode(X86::DATA16_PREFIX); - OutStreamer.EmitInstruction(prefix); - prefix.setOpcode(X86::DATA16_PREFIX); - OutStreamer.EmitInstruction(prefix); - prefix.setOpcode(X86::REX64_PREFIX); - OutStreamer.EmitInstruction(prefix); + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX)); } - MCInst call; - if (is64Bits) - call.setOpcode(X86::CALL64pcrel32); - else - call.setOpcode(X86::CALLpcrel32); StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name); const MCSymbolRefExpr *tlsRef = @@ -631,8 +634,9 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCSymbolRefExpr::VK_PLT, context); - call.addOperand(MCOperand::CreateExpr(tlsRef)); - OutStreamer.EmitInstruction(call); + OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -676,7 +680,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return LowerTlsAddr(OutStreamer, MCInstLowering, *MI); case X86::MOVPC32r: { - MCInst TmpInst; // This is a pseudo op for a two instruction sequence with a label, which // looks like: // call "L1$pb" @@ -685,20 +688,17 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Emit the call. MCSymbol *PICBase = MF->getPICBaseSymbol(); - TmpInst.setOpcode(X86::CALLpcrel32); // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. - TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase, - OutContext))); - OutStreamer.EmitInstruction(TmpInst); + OutStreamer.EmitInstruction(MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); // Emit the label. OutStreamer.EmitLabel(PICBase); // popl $reg - TmpInst.setOpcode(X86::POP32r); - TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg()); - OutStreamer.EmitInstruction(TmpInst); + OutStreamer.EmitInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); return; } @@ -728,12 +728,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), DotExpr, OutContext); - MCInst TmpInst; - TmpInst.setOpcode(X86::ADD32ri); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); - TmpInst.addOperand(MCOperand::CreateExpr(DotExpr)); - OutStreamer.EmitInstruction(TmpInst); + OutStreamer.EmitInstruction(MCInstBuilder(X86::ADD32ri) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(DotExpr)); return; } } diff --git a/lib/Target/X86/X86MCInstLower.h b/lib/Target/X86/X86MCInstLower.h deleted file mode 100644 index b4d4cfd..0000000 --- a/lib/Target/X86/X86MCInstLower.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- X86MCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef X86_MCINSTLOWER_H -#define X86_MCINSTLOWER_H - -#include "llvm/Support/Compiler.h" - -namespace llvm { - class MCAsmInfo; - class MCContext; - class MCInst; - class MCOperand; - class MCSymbol; - class MachineInstr; - class MachineFunction; - class MachineModuleInfoMachO; - class MachineOperand; - class Mangler; - class TargetMachine; - class X86AsmPrinter; - -/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. -class LLVM_LIBRARY_VISIBILITY X86MCInstLower { - MCContext &Ctx; - Mangler *Mang; - const MachineFunction &MF; - const TargetMachine &TM; - const MCAsmInfo &MAI; - X86AsmPrinter &AsmPrinter; -public: - X86MCInstLower(Mangler *mang, const MachineFunction &MF, - X86AsmPrinter &asmprinter); - - void Lower(const MachineInstr *MI, MCInst &OutMI) const; - - MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; - -private: - MachineModuleInfoMachO &getMachOMMI() const; -}; - -} - -#endif diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp new file mode 100644 index 0000000..c22872f --- /dev/null +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -0,0 +1,171 @@ +//===-------- X86PadShortFunction.cpp - pad short functions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will pad short functions to prevent +// a stall if a function returns before the return address is ready. This +// is needed for some Intel Atom processors. +// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#define DEBUG_TYPE "x86-pad-short-functions" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +STATISTIC(NumBBsPadded, "Number of basic blocks padded"); + +namespace { + struct PadShortFunc : public MachineFunctionPass { + static char ID; + PadShortFunc() : MachineFunctionPass(ID) + , Threshold(4), TM(0), TII(0) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "X86 Atom pad short functions"; + } + + private: + void findReturns(MachineBasicBlock *MBB, + unsigned int Cycles = 0); + + bool cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles, + MachineBasicBlock::iterator *Location = 0); + + void addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd); + + const unsigned int Threshold; + DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs; + + const TargetMachine *TM; + const TargetInstrInfo *TII; + }; + + char PadShortFunc::ID = 0; +} + +FunctionPass *llvm::createX86PadShortFunctions() { + return new PadShortFunc(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// NOOP instructions before early exits. +bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { + bool OptForSize = MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + if (OptForSize) + return false; + + TM = &MF.getTarget(); + TII = TM->getInstrInfo(); + + // Search through basic blocks and mark the ones that have early returns + ReturnBBs.clear(); + findReturns(MF.begin()); + + bool MadeChange = false; + + MachineBasicBlock::iterator ReturnLoc; + MachineBasicBlock *MBB; + unsigned int Cycles = 0; + unsigned int BBCycles; + + // Pad the identified basic blocks with NOOPs + for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); + I != ReturnBBs.end(); ++I) { + MBB = I->first; + Cycles = I->second; + + if (Cycles < Threshold) { + if (!cyclesUntilReturn(MBB, BBCycles, &ReturnLoc)) + continue; + + addPadding(MBB, ReturnLoc, Threshold - Cycles); + NumBBsPadded++; + MadeChange = true; + } + } + + return MadeChange; +} + +/// findReturn - Starting at MBB, follow control flow and add all +/// basic blocks that contain a return to ReturnBBs. +void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { + // If this BB has a return, note how many cycles it takes to get there. + bool hasReturn = cyclesUntilReturn(MBB, Cycles); + if (Cycles >= Threshold) + return; + + if (hasReturn) { + ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + return; + } + + // Follow branches in BB and look for returns + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(); + I != MBB->succ_end(); ++I) { + findReturns(*I, Cycles); + } +} + +/// cyclesUntilReturn - if the MBB has a return instruction, set Location +/// to the instruction and return true. Return false otherwise. +/// Cycles will be incremented by the number of cycles taken to reach the +/// return or the end of the BB, whichever occurs first. +bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles, + MachineBasicBlock::iterator *Location) { + for (MachineBasicBlock::iterator MBBI = MBB->begin(); + MBBI != MBB->end(); ++MBBI) { + MachineInstr *MI = MBBI; + // Mark basic blocks with a return instruction. Calls to other + // functions do not count because the called function will be padded, + // if necessary. + if (MI->isReturn() && !MI->isCall()) { + if (Location) + *Location = MBBI; + return true; + } + + Cycles += TII->getInstrLatency(TM->getInstrItineraryData(), MI); + } + + return false; +} + +/// addPadding - Add the given number of NOOP instructions to the function +/// just prior to the return at MBBI +void PadShortFunc::addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd) { + DebugLoc DL = MBBI->getDebugLoc(); + + while (NOOPsToAdd-- > 0) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + } +} diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 3b4cfc4..58064b8 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -19,25 +19,25 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Type.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/CommandLine.h" #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" @@ -56,10 +56,12 @@ EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii) - : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::RIP : X86::EIP, + : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true)), + X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), + (tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::RIP : X86::EIP)), TM(tm), TII(tii) { X86_MC::InitLLVM2SEHRegisterMapping(this); @@ -106,23 +108,7 @@ X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { int X86RegisterInfo::getSEHRegNum(unsigned i) const { - int reg = X86_MC::getX86RegNum(i); - switch (i) { - case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: - case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: - case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: - case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: - case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: - case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: - case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: - case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: - case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: - case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: - case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: - case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: - reg += 8; - } - return reg; + return getEncodingValue(i); } const TargetRegisterClass * @@ -206,6 +192,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) return &X86::GR64_TCW64RegClass; if (TM.getSubtarget<X86Subtarget>().is64Bit()) return &X86::GR64_TCRegClass; + + const Function *F = MF.getFunction(); + bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; return &X86::GR32_TCRegClass; } } @@ -245,15 +236,28 @@ const uint16_t * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool callsEHReturn = false; bool ghcCall = false; + bool oclBiCall = false; + bool hipeCall = false; + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); if (MF) { callsEHReturn = MF->getMMI().callsEHReturn(); const Function *F = MF->getFunction(); ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); + oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false); + hipeCall = (F ? F->getCallingConv() == CallingConv::HiPE : false); } - if (ghcCall) + if (ghcCall || hipeCall) return CSR_NoRegs_SaveList; + if (oclBiCall) { + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_SaveList; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_SaveList; + } if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -268,7 +272,17 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - if (CC == CallingConv::GHC) + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + + if (CC == CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX) + return CSR_Win64_Intel_OCL_BI_AVX_RegMask; + if (Is64Bit && HasAVX) + return CSR_64_Intel_OCL_BI_AVX_RegMask; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_RegMask; + } + if (CC == CallingConv::GHC || CC == CallingConv::HiPE) return CSR_NoRegs_RegMask; if (!Is64Bit) return CSR_32_RegMask; @@ -277,6 +291,11 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { return CSR_64_RegMask; } +const uint32_t* +X86RegisterInfo::getNoPreservedMask() const { + return CSR_NoRegs_RegMask; +} + BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); @@ -398,8 +417,10 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttr(Attribute::StackAlignment)); + bool requiresRealignment = + ((MFI->getMaxAlignment() > StackAlign) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackAlignment)); // If we've requested that we force align the stack do so now. if (ForceStackAlign) @@ -590,9 +611,10 @@ unsigned X86RegisterInfo::getEHHandlerRegister() const { } namespace llvm { -unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { - switch (VT.getSimpleVT().SimpleTy) { - default: return Reg; +unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, + bool High) { + switch (VT) { + default: llvm_unreachable("Unexpected VT"); case MVT::i8: if (High) { switch (Reg) { @@ -608,7 +630,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { } } else { switch (Reg) { - default: return 0; + default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AL; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -645,7 +667,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { } case MVT::i16: switch (Reg) { - default: return Reg; + default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -681,7 +703,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { } case MVT::i32: switch (Reg) { - default: return Reg; + default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::EAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -733,7 +755,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { } } switch (Reg) { - default: return Reg; + default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::RAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -770,46 +792,3 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { } } } - -namespace { - struct MSAH : public MachineFunctionPass { - static char ID; - MSAH() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF) { - const X86TargetMachine *TM = - static_cast<const X86TargetMachine *>(&MF.getTarget()); - const TargetFrameLowering *TFI = TM->getFrameLowering(); - MachineRegisterInfo &RI = MF.getRegInfo(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - unsigned StackAlignment = TFI->getStackAlignment(); - - // Be over-conservative: scan over all vreg defs and find whether vector - // registers are used. If yes, there is a possibility that vector register - // will be spilled and thus require dynamic stack realignment. - for (unsigned i = 0, e = RI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (RI.getRegClass(Reg)->getAlignment() > StackAlignment) { - FuncInfo->setForceFramePointer(true); - return true; - } - } - // Nothing to do - return false; - } - - virtual const char *getPassName() const { - return "X86 Maximal Stack Alignment Check"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - }; - - char MSAH::ID = 0; -} - -FunctionPass* -llvm::createX86MaxStackAlignmentHeuristicPass() { return new MSAH(); } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 1bc32cb..7932ede 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -58,10 +58,6 @@ private: public: X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); - /// getX86RegNum - Returns the native X86 register number for the given LLVM - /// register identifier. - static unsigned getX86RegNum(unsigned RegNo); - // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -104,6 +100,7 @@ public: /// callee-save registers on this target. const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; + const uint32_t *getNoPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and @@ -141,8 +138,8 @@ public: // getX86SubSuperRegister - X86 utility function. It returns the sub or super // register of a specific X86 register. -// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) return X86:AX -unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false); +// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX +unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); } // End llvm namespace diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index edc7184..be6282a 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -13,258 +13,264 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Register definitions... -// -let Namespace = "X86" in { +class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> { + let Namespace = "X86"; + let HWEncoding = Enc; + let SubRegs = subregs; +} - // Subregister indices. +// Subregister indices. +let Namespace = "X86" in { def sub_8bit : SubRegIndex; def sub_8bit_hi : SubRegIndex; def sub_16bit : SubRegIndex; def sub_32bit : SubRegIndex; - def sub_xmm : SubRegIndex; - - - // In the register alias definitions below, we define which registers alias - // which others. We only specify which registers the small registers alias, - // because the register file generator is smart enough to figure out that - // AL aliases AX if we tell it that AX aliased AL (for example). - - // Dwarf numbering is different for 32-bit and 64-bit, and there are - // variations by target as well. Currently the first entry is for X86-64, - // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux - // and debug information on X86-32/Darwin) - - // 8-bit registers - // Low registers - def AL : Register<"al">; - def DL : Register<"dl">; - def CL : Register<"cl">; - def BL : Register<"bl">; - - // X86-64 only, requires REX. - let CostPerUse = 1 in { - def SIL : Register<"sil">; - def DIL : Register<"dil">; - def BPL : Register<"bpl">; - def SPL : Register<"spl">; - def R8B : Register<"r8b">; - def R9B : Register<"r9b">; - def R10B : Register<"r10b">; - def R11B : Register<"r11b">; - def R12B : Register<"r12b">; - def R13B : Register<"r13b">; - def R14B : Register<"r14b">; - def R15B : Register<"r15b">; - } - - // High registers. On x86-64, these cannot be used in any instruction - // with a REX prefix. - def AH : Register<"ah">; - def DH : Register<"dh">; - def CH : Register<"ch">; - def BH : Register<"bh">; - - // 16-bit registers - let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { - def AX : RegisterWithSubRegs<"ax", [AL,AH]>; - def DX : RegisterWithSubRegs<"dx", [DL,DH]>; - def CX : RegisterWithSubRegs<"cx", [CL,CH]>; - def BX : RegisterWithSubRegs<"bx", [BL,BH]>; - } - let SubRegIndices = [sub_8bit] in { - def SI : RegisterWithSubRegs<"si", [SIL]>; - def DI : RegisterWithSubRegs<"di", [DIL]>; - def BP : RegisterWithSubRegs<"bp", [BPL]>; - def SP : RegisterWithSubRegs<"sp", [SPL]>; - } - def IP : Register<"ip">; - - // X86-64 only, requires REX. - let SubRegIndices = [sub_8bit], CostPerUse = 1 in { - def R8W : RegisterWithSubRegs<"r8w", [R8B]>; - def R9W : RegisterWithSubRegs<"r9w", [R9B]>; - def R10W : RegisterWithSubRegs<"r10w", [R10B]>; - def R11W : RegisterWithSubRegs<"r11w", [R11B]>; - def R12W : RegisterWithSubRegs<"r12w", [R12B]>; - def R13W : RegisterWithSubRegs<"r13w", [R13B]>; - def R14W : RegisterWithSubRegs<"r14w", [R14B]>; - def R15W : RegisterWithSubRegs<"r15w", [R15B]>; - } - // 32-bit registers - let SubRegIndices = [sub_16bit] in { - def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>; - def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>; - def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>; - def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>; - def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>; - def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>; - def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>; - def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>; - def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>; - - // X86-64 only, requires REX - let CostPerUse = 1 in { - def R8D : RegisterWithSubRegs<"r8d", [R8W]>; - def R9D : RegisterWithSubRegs<"r9d", [R9W]>; - def R10D : RegisterWithSubRegs<"r10d", [R10W]>; - def R11D : RegisterWithSubRegs<"r11d", [R11W]>; - def R12D : RegisterWithSubRegs<"r12d", [R12W]>; - def R13D : RegisterWithSubRegs<"r13d", [R13W]>; - def R14D : RegisterWithSubRegs<"r14d", [R14W]>; - def R15D : RegisterWithSubRegs<"r15d", [R15W]>; - }} - - // 64-bit registers, X86-64 only - let SubRegIndices = [sub_32bit] in { - def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>; - def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>; - def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>; - def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>; - def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>; - def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>; - def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; - def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; - - // These also require REX. - let CostPerUse = 1 in { - def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>; - def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>; - def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>; - def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>; - def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>; - def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>; - def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>; - def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>; - def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>; - }} - - // MMX Registers. These are actually aliased to ST0 .. ST7 - def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>; - def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>; - def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>; - def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>; - def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>; - def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>; - def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>; - def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>; - - // Pseudo Floating Point registers - def FP0 : Register<"fp0">; - def FP1 : Register<"fp1">; - def FP2 : Register<"fp2">; - def FP3 : Register<"fp3">; - def FP4 : Register<"fp4">; - def FP5 : Register<"fp5">; - def FP6 : Register<"fp6">; - - // XMM Registers, used by the various SSE instruction set extensions. - def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>; - def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>; - def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>; - def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>; - def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>; - def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>; - def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>; - def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>; - - // X86-64 only - let CostPerUse = 1 in { - def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>; - def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>; - def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>; - def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>; - def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>; - def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>; - def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>; - def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; - } // CostPerUse - - // YMM Registers, used by AVX instructions - let SubRegIndices = [sub_xmm] in { - def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>; - def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>; - def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>; - def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>; - def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>; - def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>; - def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>; - def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>; - def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>; - def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>; - def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>; - def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>; - def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>; - def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>; - def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>; - def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>; - } - - class STRegister<string Name, list<Register> A> : Register<Name> { - let Aliases = A; - } - - // Floating point stack registers. These don't map one-to-one to the FP - // pseudo registers, but we still mark them as aliasing FP registers. That - // way both kinds can be live without exceeding the stack depth. ST registers - // are only live around inline assembly. - def ST0 : STRegister<"st(0)", []>, DwarfRegNum<[33, 12, 11]>; - def ST1 : STRegister<"st(1)", [FP6]>, DwarfRegNum<[34, 13, 12]>; - def ST2 : STRegister<"st(2)", [FP5]>, DwarfRegNum<[35, 14, 13]>; - def ST3 : STRegister<"st(3)", [FP4]>, DwarfRegNum<[36, 15, 14]>; - def ST4 : STRegister<"st(4)", [FP3]>, DwarfRegNum<[37, 16, 15]>; - def ST5 : STRegister<"st(5)", [FP2]>, DwarfRegNum<[38, 17, 16]>; - def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>; - def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>; - - // Floating-point status word - def FPSW : Register<"fpsw">; - - // Status flags register - def EFLAGS : Register<"flags">; - - // Segment registers - def CS : Register<"cs">; - def DS : Register<"ds">; - def SS : Register<"ss">; - def ES : Register<"es">; - def FS : Register<"fs">; - def GS : Register<"gs">; - - // Debug registers - def DR0 : Register<"dr0">; - def DR1 : Register<"dr1">; - def DR2 : Register<"dr2">; - def DR3 : Register<"dr3">; - def DR4 : Register<"dr4">; - def DR5 : Register<"dr5">; - def DR6 : Register<"dr6">; - def DR7 : Register<"dr7">; - - // Control registers - def CR0 : Register<"cr0">; - def CR1 : Register<"cr1">; - def CR2 : Register<"cr2">; - def CR3 : Register<"cr3">; - def CR4 : Register<"cr4">; - def CR5 : Register<"cr5">; - def CR6 : Register<"cr6">; - def CR7 : Register<"cr7">; - def CR8 : Register<"cr8">; - def CR9 : Register<"cr9">; - def CR10 : Register<"cr10">; - def CR11 : Register<"cr11">; - def CR12 : Register<"cr12">; - def CR13 : Register<"cr13">; - def CR14 : Register<"cr14">; - def CR15 : Register<"cr15">; - - // Pseudo index registers - def EIZ : Register<"eiz">; - def RIZ : Register<"riz">; + def sub_xmm : SubRegIndex; } +//===----------------------------------------------------------------------===// +// Register definitions... +// + +// In the register alias definitions below, we define which registers alias +// which others. We only specify which registers the small registers alias, +// because the register file generator is smart enough to figure out that +// AL aliases AX if we tell it that AX aliased AL (for example). + +// Dwarf numbering is different for 32-bit and 64-bit, and there are +// variations by target as well. Currently the first entry is for X86-64, +// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux +// and debug information on X86-32/Darwin) + +// 8-bit registers +// Low registers +def AL : X86Reg<"al", 0>; +def DL : X86Reg<"dl", 2>; +def CL : X86Reg<"cl", 1>; +def BL : X86Reg<"bl", 3>; + +// High registers. On x86-64, these cannot be used in any instruction +// with a REX prefix. +def AH : X86Reg<"ah", 4>; +def DH : X86Reg<"dh", 6>; +def CH : X86Reg<"ch", 5>; +def BH : X86Reg<"bh", 7>; + +// X86-64 only, requires REX. +let CostPerUse = 1 in { +def SIL : X86Reg<"sil", 6>; +def DIL : X86Reg<"dil", 7>; +def BPL : X86Reg<"bpl", 5>; +def SPL : X86Reg<"spl", 4>; +def R8B : X86Reg<"r8b", 8>; +def R9B : X86Reg<"r9b", 9>; +def R10B : X86Reg<"r10b", 10>; +def R11B : X86Reg<"r11b", 11>; +def R12B : X86Reg<"r12b", 12>; +def R13B : X86Reg<"r13b", 13>; +def R14B : X86Reg<"r14b", 14>; +def R15B : X86Reg<"r15b", 15>; +} + +// 16-bit registers +let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { +def AX : X86Reg<"ax", 0, [AL,AH]>; +def DX : X86Reg<"dx", 2, [DL,DH]>; +def CX : X86Reg<"cx", 1, [CL,CH]>; +def BX : X86Reg<"bx", 3, [BL,BH]>; +} +let SubRegIndices = [sub_8bit] in { +def SI : X86Reg<"si", 6, [SIL]>; +def DI : X86Reg<"di", 7, [DIL]>; +def BP : X86Reg<"bp", 5, [BPL]>; +def SP : X86Reg<"sp", 4, [SPL]>; +} +def IP : X86Reg<"ip", 0>; + +// X86-64 only, requires REX. +let SubRegIndices = [sub_8bit], CostPerUse = 1 in { +def R8W : X86Reg<"r8w", 8, [R8B]>; +def R9W : X86Reg<"r9w", 9, [R9B]>; +def R10W : X86Reg<"r10w", 10, [R10B]>; +def R11W : X86Reg<"r11w", 11, [R11B]>; +def R12W : X86Reg<"r12w", 12, [R12B]>; +def R13W : X86Reg<"r13w", 13, [R13B]>; +def R14W : X86Reg<"r14w", 14, [R14B]>; +def R15W : X86Reg<"r15w", 15, [R15B]>; +} + +// 32-bit registers +let SubRegIndices = [sub_16bit] in { +def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>; +def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>; +def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>; +def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>; +def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>; +def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>; +def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>; +def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>; +def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>; + +// X86-64 only, requires REX +let CostPerUse = 1 in { +def R8D : X86Reg<"r8d", 8, [R8W]>; +def R9D : X86Reg<"r9d", 9, [R9W]>; +def R10D : X86Reg<"r10d", 10, [R10W]>; +def R11D : X86Reg<"r11d", 11, [R11W]>; +def R12D : X86Reg<"r12d", 12, [R12W]>; +def R13D : X86Reg<"r13d", 13, [R13W]>; +def R14D : X86Reg<"r14d", 14, [R14W]>; +def R15D : X86Reg<"r15d", 15, [R15W]>; +}} + +// 64-bit registers, X86-64 only +let SubRegIndices = [sub_32bit] in { +def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; +def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; +def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; +def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; +def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; +def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; +def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; +def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; + +// These also require REX. +let CostPerUse = 1 in { +def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; +def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; +def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; +def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; +def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; +def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; +def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; +def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; +def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; +}} + +// MMX Registers. These are actually aliased to ST0 .. ST7 +def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; +def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; +def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; +def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; +def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; +def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; +def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; +def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; + +// Pseudo Floating Point registers +def FP0 : X86Reg<"fp0", 0>; +def FP1 : X86Reg<"fp1", 0>; +def FP2 : X86Reg<"fp2", 0>; +def FP3 : X86Reg<"fp3", 0>; +def FP4 : X86Reg<"fp4", 0>; +def FP5 : X86Reg<"fp5", 0>; +def FP6 : X86Reg<"fp6", 0>; + +// XMM Registers, used by the various SSE instruction set extensions. +def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; +def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; +def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; +def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; +def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; +def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; +def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; +def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; + +// X86-64 only +let CostPerUse = 1 in { +def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; +def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; +def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; +def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; +def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; +def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; +def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; +def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; +} // CostPerUse + +// YMM Registers, used by AVX instructions +let SubRegIndices = [sub_xmm] in { +def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>; +def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>; +def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>; +def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>; +def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>; +def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>; +def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>; +def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>; +def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>; +def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>; +def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>; +def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>; +def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>; +def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>; +def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>; +def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>; +} + +class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { + let Aliases = A; +} + +// Floating point stack registers. These don't map one-to-one to the FP +// pseudo registers, but we still mark them as aliasing FP registers. That +// way both kinds can be live without exceeding the stack depth. ST registers +// are only live around inline assembly. +def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>; +def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>; +def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>; +def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>; +def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>; +def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>; +def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>; +def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>; + +// Floating-point status word +def FPSW : X86Reg<"fpsw", 0>; + +// Status flags register +def EFLAGS : X86Reg<"flags", 0>; + +// Segment registers +def CS : X86Reg<"cs", 1>; +def DS : X86Reg<"ds", 3>; +def SS : X86Reg<"ss", 2>; +def ES : X86Reg<"es", 0>; +def FS : X86Reg<"fs", 4>; +def GS : X86Reg<"gs", 5>; + +// Debug registers +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; + +// Control registers +def CR0 : X86Reg<"cr0", 0>; +def CR1 : X86Reg<"cr1", 1>; +def CR2 : X86Reg<"cr2", 2>; +def CR3 : X86Reg<"cr3", 3>; +def CR4 : X86Reg<"cr4", 4>; +def CR5 : X86Reg<"cr5", 5>; +def CR6 : X86Reg<"cr6", 6>; +def CR7 : X86Reg<"cr7", 7>; +def CR8 : X86Reg<"cr8", 8>; +def CR9 : X86Reg<"cr9", 9>; +def CR10 : X86Reg<"cr10", 10>; +def CR11 : X86Reg<"cr11", 11>; +def CR12 : X86Reg<"cr12", 12>; +def CR13 : X86Reg<"cr13", 13>; +def CR14 : X86Reg<"cr14", 14>; +def CR15 : X86Reg<"cr15", 15>; + +// Pseudo index registers +def EIZ : X86Reg<"eiz", 4>; +def RIZ : X86Reg<"riz", 4>; + //===----------------------------------------------------------------------===// // Register Class Definitions... now that we have all of the pieces, define the diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 00edcbc..757e8c7 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -13,8 +13,8 @@ #define DEBUG_TYPE "x86-selectiondag-info" #include "X86TargetMachine.h" -#include "llvm/DerivedTypes.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DerivedTypes.h" using namespace llvm; X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : @@ -54,7 +54,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 0d7b664..53c28f4 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -14,11 +14,11 @@ #define DEBUG_TYPE "subtarget" #include "X86Subtarget.h" #include "X86InstrInfo.h" -#include "llvm/GlobalValue.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -163,17 +163,6 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -/// getSpecialAddressLatency - For targets where it is beneficial to -/// backschedule instructions that compute addresses, return a value -/// indicating the number of scheduling cycles of backscheduling that -/// should be attempted. -unsigned X86Subtarget::getSpecialAddressLatency() const { - // For x86 out-of-order targets, back-schedule address computations so - // that loads and stores aren't blocked. - // This value was chosen arbitrarily. - return 200; -} - void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; unsigned MaxLevel; @@ -245,12 +234,20 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { ToggleFeature(X86::FeatureSlowBTMem); } - // If it's Nehalem, unaligned memory access is fast. - // Include Westmere and Sandy Bridge as well. - // FIXME: add later processors. - if (IsIntel && ((Family == 6 && Model == 26) || - (Family == 6 && Model == 44) || - (Family == 6 && Model == 42))) { + // If it's an Intel chip since Nehalem and not an Atom chip, unaligned + // memory access is fast. We hard code model numbers here because they + // aren't strictly increasing for Intel chips it seems. + if (IsIntel && + ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield, + // Jasper Froest + (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP + (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX + (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale + (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP + (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX + (Family == 6 && Model == 0x2A) || // SandyBridge + (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E* + (Family == 6 && Model == 0x3A))) {// IvyBridge IsUAMemFast = true; ToggleFeature(X86::FeatureFastUAMem); } @@ -313,6 +310,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasBMI2 = true; ToggleFeature(X86::FeatureBMI2); } + if (IsIntel && ((EBX >> 11) & 0x1)) { + HasRTM = true; + ToggleFeature(X86::FeatureRTM); + } } } } @@ -341,6 +342,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, , HasLZCNT(false) , HasBMI(false) , HasBMI2(false) + , HasRTM(false) , IsBTMemSlow(false) , IsUAMemFast(false) , HasVectorUAMem(false) @@ -348,6 +350,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, , UseLeaForSP(false) , HasSlowDivide(false) , PostRAScheduler(false) + , PadShortFunctions(false) , stackAlignment(4) // FIXME: this is a known good value for Yonah. How about others? , MaxInlineSizeThreshold(128) @@ -401,6 +404,10 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, } } + // CPUName may have been set by the CPU detection code. Make sure the + // new MCSchedModel is used. + InitMCProcessorInfo(CPUName, FS); + if (X86ProcFamily == IntelAtom) PostRAScheduler = true; @@ -417,12 +424,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, assert((!In64BitMode || HasX86_64) && "64-bit code requested on a subtarget that doesn't support it!"); - // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both + // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both // 32 and 64 bit) and for all 64-bit targets. if (StackAlignOverride) stackAlignment = StackAlignOverride; - else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() || - isTargetSolaris() || In64BitMode) + else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || + In64BitMode) stackAlignment = 16; } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index dde7e24..080f4cf 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,8 +14,8 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H -#include "llvm/CallingConv.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -118,6 +118,9 @@ protected: /// HasBMI2 - Processor has BMI2 instructions. bool HasBMI2; + /// HasRTM - Processor has RTM instructions. + bool HasRTM; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -143,6 +146,10 @@ protected: /// PostRAScheduler - True if using post-register-allocation scheduler. bool PostRAScheduler; + /// PadShortFunctions - True if the short functions should be padded to prevent + /// a stall when returning too early. + bool PadShortFunctions; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -202,7 +209,8 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } - bool hasNoAVX() const { return X86SSELevel < AVX; } + bool hasFp256() const { return hasAVX(); } + bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } @@ -220,12 +228,14 @@ public: bool hasLZCNT() const { return HasLZCNT; } bool hasBMI() const { return HasBMI; } bool hasBMI2() const { return HasBMI2; } + bool hasRTM() const { return HasRTM; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } bool hasSlowDivide() const { return HasSlowDivide; } + bool padShortFunctions() const { return PadShortFunctions; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -238,13 +248,13 @@ public: bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } - - // ELF is a reasonably sane default and the only other X86 targets we - // support are Darwin and Windows. Just use "not those". - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetELF() const { + return (TargetTriple.getEnvironment() == Triple::ELF || + TargetTriple.isOSBinFormatELF()); + } bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } bool isTargetNaCl() const { - return TargetTriple.getOS() == Triple::NativeClient; + return TargetTriple.getOS() == Triple::NaCl; } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } @@ -252,7 +262,10 @@ public: bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; } bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } - bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetCOFF() const { + return (TargetTriple.getEnvironment() != Triple::ELF && + TargetTriple.isOSBinFormatCOFF()); + } bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); } bool isTargetWin64() const { @@ -303,12 +316,6 @@ public: /// returns null. const char *getBZeroEntry() const; - /// getSpecialAddressLatency - For targets where it is beneficial to - /// backschedule instructions that compute addresses, return a value - /// indicating the number of scheduling cycles of backscheduling that - /// should be attempted. - unsigned getSpecialAddressLatency() const; - /// enablePostRAScheduler - run for Atom optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index b7ba568..706e64a 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -13,13 +13,13 @@ #include "X86TargetMachine.h" #include "X86.h" -#include "llvm/PassManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; extern "C" void LLVMInitializeX86Target() { @@ -36,7 +36,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false), - DataLayout(getSubtargetImpl()->isTargetDarwin() ? + DL(getSubtargetImpl()->isTargetDarwin() ? "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-" "n8:16:32-S128" : (getSubtargetImpl()->isTargetCygMing() || @@ -46,8 +46,8 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-" "n8:16:32-S128"), InstrInfo(*this), - TSInfo(*this), TLInfo(*this), + TSInfo(*this), JITInfo(*this) { } @@ -59,11 +59,11 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true), - DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" "n8:16:32:64-S128"), InstrInfo(*this), - TSInfo(*this), TLInfo(*this), + TSInfo(*this), JITInfo(*this) { } @@ -78,7 +78,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit), FrameLowering(*this, Subtarget), - ELFWriterInfo(is64Bit, true), InstrItins(Subtarget.getInstrItineraryData()){ // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { @@ -113,6 +112,25 @@ UseVZeroUpper("x86-use-vzeroupper", cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt<bool> +X86EarlyIfConv("x86-early-ifcvt", + cl::desc("Enable early if-conversion on X86")); + +//===----------------------------------------------------------------------===// +// X86 Analysis Pass Setup +//===----------------------------------------------------------------------===// + +void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our X86 pass. This + // allows the X86 pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(getTargetLowering())); + PM.add(createX86TargetTransformInfoPass(this)); +} + + //===----------------------------------------------------------------------===// // Pass Pipeline Configuration //===----------------------------------------------------------------------===// @@ -142,7 +160,7 @@ public: TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { X86PassConfig *PC = new X86PassConfig(this, PM); - if (Subtarget.hasCMov()) + if (X86EarlyIfConv && Subtarget.hasCMov()) PC->enablePass(&EarlyIfConverterID); return PC; @@ -164,7 +182,6 @@ bool X86PassConfig::addInstSelector() { } bool X86PassConfig::addPreRegAlloc() { - addPass(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } @@ -185,6 +202,12 @@ bool X86PassConfig::addPreEmitPass() { ShouldPrint = true; } + if (getOptLevel() != CodeGenOpt::None && + getX86Subtarget().padShortFunctions()) { + addPass(createX86PadShortFunctions()); + ShouldPrint = true; + } + return ShouldPrint; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 8e935af..174d391 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -15,16 +15,15 @@ #define X86TARGETMACHINE_H #include "X86.h" -#include "X86ELFWriterInfo.h" -#include "X86InstrInfo.h" -#include "X86ISelLowering.h" #include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" #include "X86JITInfo.h" #include "X86SelectionDAGInfo.h" #include "X86Subtarget.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetData.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { @@ -33,7 +32,6 @@ class StringRef; class X86TargetMachine : public LLVMTargetMachine { X86Subtarget Subtarget; X86FrameLowering FrameLowering; - X86ELFWriterInfo ELFWriterInfo; InstrItineraryData InstrItins; public: @@ -62,13 +60,13 @@ public: virtual const X86RegisterInfo *getRegisterInfo() const { return &getInstrInfo()->getRegisterInfo(); } - virtual const X86ELFWriterInfo *getELFWriterInfo() const { - return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; - } virtual const InstrItineraryData *getInstrItineraryData() const { return &InstrItins; } + /// \brief Register X86 analysis passes with a pass manager. + virtual void addAnalysisPasses(PassManagerBase &PM); + // Set up the pass pipeline. virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); @@ -80,17 +78,17 @@ public: /// class X86_32TargetMachine : public X86TargetMachine { virtual void anchor(); - const TargetData DataLayout; // Calculates type size & alignment + const DataLayout DL; // Calculates type size & alignment X86InstrInfo InstrInfo; - X86SelectionDAGInfo TSInfo; X86TargetLowering TLInfo; + X86SelectionDAGInfo TSInfo; X86JITInfo JITInfo; public: X86_32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const DataLayout *getDataLayout() const { return &DL; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; } @@ -109,17 +107,17 @@ public: /// class X86_64TargetMachine : public X86TargetMachine { virtual void anchor(); - const TargetData DataLayout; // Calculates type size & alignment + const DataLayout DL; // Calculates type size & alignment X86InstrInfo InstrInfo; - X86SelectionDAGInfo TSInfo; X86TargetLowering TLInfo; + X86SelectionDAGInfo TSInfo; X86JITInfo JITInfo; public: X86_64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const DataLayout *getDataLayout() const { return &DL; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 92aee0d..b8ee319 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -15,16 +15,16 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/Target/Mangler.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ELF.h" +#include "llvm/Target/Mangler.h" using namespace llvm; using namespace dwarf; const MCExpr *X86_64MachoTargetObjectFile:: -getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding, - MCStreamer &Streamer) const { +getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI, unsigned Encoding, + MCStreamer &Streamer) const { // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which // is an indirect pc-relative reference. @@ -37,7 +37,7 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, } return TargetLoweringObjectFileMachO:: - getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); + getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer); } MCSymbol *X86_64MachoTargetObjectFile:: diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 2d320c5..9d26d38 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -11,8 +11,8 @@ #define LLVM_TARGET_X86_TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { @@ -21,9 +21,9 @@ namespace llvm { class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: virtual const MCExpr * - getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding, - MCStreamer &Streamer) const; + getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI, unsigned Encoding, + MCStreamer &Streamer) const; // getCFIPersonalitySymbol - The symbol that gets passed to // .cfi_personality. diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp new file mode 100644 index 0000000..9cc1b18 --- /dev/null +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -0,0 +1,355 @@ +//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86tti" +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +// Declare the pass initialization routine locally as target-specific passes +// don't havve a target-wide initialization entry point, and so we rely on the +// pass constructor initialization. +namespace llvm { +void initializeX86TTIPass(PassRegistry &); +} + +namespace { + +class X86TTI : public ImmutablePass, public TargetTransformInfo { + const X86TargetMachine *TM; + const X86Subtarget *ST; + const X86TargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + +public: + X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + X86TTI(const X86TargetMachine *TM) + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializeX86TTIPass(*PassRegistry::getPassRegistry()); + } + + virtual void initializePass() { + pushTTIStack(this); + } + + virtual void finalizePass() { + popTTIStack(); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. + virtual void *getAdjustedAnalysisPointer(const void *ID) { + if (ID == &TargetTransformInfo::ID) + return (TargetTransformInfo*)this; + return this; + } + + /// \name Scalar TTI Implementations + /// @{ + + virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; + virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, + int Index, Type *SubTp) const; + virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const; + virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const; + virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const; + virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const; + + /// @} +}; + +} // end anonymous namespace + +INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", + "X86 Target Transform Info", true, true, false) +char X86TTI::ID = 0; + +ImmutablePass * +llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { + return new X86TTI(TM); +} + + +//===----------------------------------------------------------------------===// +// +// X86 cost model. +// +//===----------------------------------------------------------------------===// + +namespace { +struct X86CostTblEntry { + int ISD; + MVT Type; + unsigned Cost; +}; +} + +static int +FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) + return i; + + // Could not find an entry. + return -1; +} + +namespace { +struct X86TypeConversionCostTblEntry { + int ISD; + MVT Dst; + MVT Src; + unsigned Cost; +}; +} + +static int +FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, + int ISD, MVT Dst, MVT Src) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) + return i; + + // Could not find an entry. + return -1; +} + + +X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + // TODO: Currently the __builtin_popcount() implementation using SSE3 + // instructions is inefficient. Once the problem is fixed, we should + // call ST->hasSSE3() instead of ST->hasSSE4(). + return ST->hasSSE41() ? PSK_FastHardware : PSK_Software; +} + +unsigned X86TTI::getNumberOfRegisters(bool Vector) const { + if (ST->is64Bit()) + return 16; + return 8; +} + +unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + static const X86CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::MUL, MVT::v4i64, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + }; + + // Look for AVX1 lowering tricks. + if (ST->hasAVX()) { + int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, + LT.second); + if (Idx != -1) + return LT.first * AVX1CostTable[Idx].Cost; + } + // Fallback to the default implementation. + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty); +} + +unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) const { + // We only estimate the cost of reverse shuffles. + if (Kind != SK_Reverse) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + unsigned Cost = 1; + if (LT.second.getSizeInBits() > 128) + Cost = 3; // Extract + insert + copy. + + // Multiple by the number of parts. + return Cost * LT.first; +} + +unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + static const X86TypeConversionCostTblEntry AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, + }; + + if (ST->hasAVX()) { + int Idx = FindInConvertTable(AVXConversionTbl, + array_lengthof(AVXConversionTbl), + ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + if (Idx != -1) + return AVXConversionTbl[Idx].Cost; + } + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + static const X86CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + }; + + static const X86CostTblEntry AVX1CostTbl[] = { + { ISD::SETCC, MVT::v4f64, 1 }, + { ISD::SETCC, MVT::v8f32, 1 }, + // AVX1 does not support 8-wide integer compare. + { ISD::SETCC, MVT::v4i64, 4 }, + { ISD::SETCC, MVT::v8i32, 4 }, + { ISD::SETCC, MVT::v16i16, 4 }, + { ISD::SETCC, MVT::v32i8, 4 }, + }; + + static const X86CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + }; + + if (ST->hasAVX2()) { + int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * AVX2CostTbl[Idx].Cost; + } + + if (ST->hasAVX()) { + int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * AVX1CostTbl[Idx].Cost; + } + + if (ST->hasSSE42()) { + int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * SSE42CostTbl[Idx].Cost; + } + + return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // Floating point scalars are already located in index #0. + if (Val->getScalarType()->isFloatingPointTy() && Index == 0) + return 0; + } + + return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); +} + +unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + // Each load/store unit costs 1. + unsigned Cost = LT.first * 1; + + // On Sandybridge 256bit load/stores are double pumped + // (but not on Haswell). + if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2()) + Cost*=2; + + return Cost; +} diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 449eed3..c4a5887 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -147,7 +147,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = &X86::VR256RegClass; for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { - if (MRI.isPhysRegUsed(*i)) { + if (!MRI.reg_nodbg_empty(*i)) { YMMUsed = true; break; } |
