Diffstat (limited to 'lib/Target/X86')
73 files changed, 8198 insertions, 3355 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index f159bb2..a9c413d 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -15,6 +15,7 @@ x86_codegen_SRC_FILES := \ X86COFFMachineModuleInfo.cpp \ X86CodeEmitter.cpp \ X86FastISel.cpp \ + X86FixupLEAs.cpp \ X86FloatingPoint.cpp \ X86FrameLowering.cpp \ X86ISelDAGToDAG.cpp \ diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 4ed5534a6..ad83d97 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -9,10 +9,12 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" @@ -32,17 +34,453 @@ using namespace llvm; namespace { struct X86Operand; +static const char OpPrecedence[] = { + 0, // IC_PLUS + 0, // IC_MINUS + 1, // IC_MULTIPLY + 1, // IC_DIVIDE + 2, // IC_RPAREN + 3, // IC_LPAREN + 0, // IC_IMM + 0 // IC_REGISTER +}; + class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; MCAsmParser &Parser; ParseInstructionInfo *InstInfo; private: + enum InfixCalculatorTok { + IC_PLUS = 0, + IC_MINUS, + IC_MULTIPLY, + IC_DIVIDE, + IC_RPAREN, + IC_LPAREN, + IC_IMM, + IC_REGISTER + }; + + class InfixCalculator { + typedef std::pair< InfixCalculatorTok, int64_t > ICToken; + SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; + SmallVector<ICToken, 4> PostfixStack; + + public: + int64_t popOperand() { + assert (!PostfixStack.empty() && "Poped an empty stack!"); + ICToken Op = PostfixStack.pop_back_val(); + assert ((Op.first == IC_IMM || Op.first == IC_REGISTER) + && "Expected and immediate or register!"); + return Op.second; + } + void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) { + assert ((Op == IC_IMM || Op == IC_REGISTER) && + "Unexpected operand!"); + PostfixStack.push_back(std::make_pair(Op, Val)); + } + + void popOperator() { InfixOperatorStack.pop_back_val(); } + void pushOperator(InfixCalculatorTok Op) { + // Push the new operator if the stack is empty. + if (InfixOperatorStack.empty()) { + InfixOperatorStack.push_back(Op); + return; + } + + // Push the new operator if it has a higher precedence than the operator + // on the top of the stack or the operator on the top of the stack is a + // left parentheses. + unsigned Idx = InfixOperatorStack.size() - 1; + InfixCalculatorTok StackOp = InfixOperatorStack[Idx]; + if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) { + InfixOperatorStack.push_back(Op); + return; + } + + // The operator on the top of the stack has higher precedence than the + // new operator. + unsigned ParenCount = 0; + while (1) { + // Nothing to process. + if (InfixOperatorStack.empty()) + break; + + Idx = InfixOperatorStack.size() - 1; + StackOp = InfixOperatorStack[Idx]; + if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) + break; + + // If we have an even parentheses count and we see a left parentheses, + // then stop processing. 
+ if (!ParenCount && StackOp == IC_LPAREN) + break; + + if (StackOp == IC_RPAREN) { + ++ParenCount; + InfixOperatorStack.pop_back_val(); + } else if (StackOp == IC_LPAREN) { + --ParenCount; + InfixOperatorStack.pop_back_val(); + } else { + InfixOperatorStack.pop_back_val(); + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + } + // Push the new operator. + InfixOperatorStack.push_back(Op); + } + int64_t execute() { + // Push any remaining operators onto the postfix stack. + while (!InfixOperatorStack.empty()) { + InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val(); + if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + + if (PostfixStack.empty()) + return 0; + + SmallVector<ICToken, 16> OperandStack; + for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { + ICToken Op = PostfixStack[i]; + if (Op.first == IC_IMM || Op.first == IC_REGISTER) { + OperandStack.push_back(Op); + } else { + assert (OperandStack.size() > 1 && "Too few operands."); + int64_t Val; + ICToken Op2 = OperandStack.pop_back_val(); + ICToken Op1 = OperandStack.pop_back_val(); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_PLUS: + Val = Op1.second + Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MINUS: + Val = Op1.second - Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MULTIPLY: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Multiply operation with an immediate and a register!"); + Val = Op1.second * Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_DIVIDE: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Divide operation with an immediate and a register!"); + assert (Op2.second != 0 && "Division by zero!"); + Val = Op1.second / Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + } + } + } + assert (OperandStack.size() == 1 && "Expected a single result."); + return OperandStack.pop_back_val().second; + } + }; + + enum IntelExprState { + IES_PLUS, + IES_MINUS, + IES_MULTIPLY, + IES_DIVIDE, + IES_LBRAC, + IES_RBRAC, + IES_LPAREN, + IES_RPAREN, + IES_REGISTER, + IES_INTEGER, + IES_IDENTIFIER, + IES_ERROR + }; + + class IntelExprStateMachine { + IntelExprState State, PrevState; + unsigned BaseReg, IndexReg, TmpReg, Scale; + int64_t Imm; + const MCExpr *Sym; + StringRef SymName; + bool StopOnLBrac, AddImmPrefix; + InfixCalculator IC; + InlineAsmIdentifierInfo Info; + public: + IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : + State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), + Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac), + AddImmPrefix(addimmprefix) { Info.clear(); } + + unsigned getBaseReg() { return BaseReg; } + unsigned getIndexReg() { return IndexReg; } + unsigned getScale() { return Scale; } + const MCExpr *getSym() { return Sym; } + StringRef getSymName() { return SymName; } + int64_t getImm() { return Imm + IC.execute(); } + bool isValidEndState() { + return State == IES_RBRAC || State == IES_INTEGER; + } + bool getStopOnLBrac() { return StopOnLBrac; } + bool getAddImmPrefix() { return AddImmPrefix; } + bool hadError() { return State == IES_ERROR; } + + InlineAsmIdentifierInfo &getIdentifierInfo() { + return Info; + } + + void onPlus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + 
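Note: the InfixCalculator added above is a small shunting-yard variant — pushOperator() reorders operators onto a postfix stack according to OpPrecedence, and execute() then folds the postfix form with an operand stack. The standalone sketch below shows the same two-phase technique on a plain string; it is illustrative only (no LLVM API, and unary minus and registers are not handled the way the parser handles them).

#include <cassert>
#include <cctype>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

namespace {
struct Tok { char Op; int64_t Val; };      // Op == 0 marks an immediate value.

int prec(char Op) { return (Op == '*' || Op == '/') ? 1 : 0; }

// Phase 1 (shunting-yard): emit operands immediately; hold operators on a
// stack and flush them while their precedence is >= the incoming operator's.
std::vector<Tok> toPostfix(const std::string &E) {
  std::vector<Tok> Out;
  std::vector<char> Ops;
  for (size_t I = 0; I < E.size();) {
    char C = E[I];
    if (isspace((unsigned char)C)) { ++I; continue; }
    if (isdigit((unsigned char)C)) {
      int64_t V = 0;
      while (I < E.size() && isdigit((unsigned char)E[I]))
        V = V * 10 + (E[I++] - '0');
      Out.push_back({0, V});
      continue;
    }
    if (C == '(') { Ops.push_back(C); ++I; continue; }
    if (C == ')') {
      while (!Ops.empty() && Ops.back() != '(') {
        Out.push_back({Ops.back(), 0});
        Ops.pop_back();
      }
      assert(!Ops.empty() && "unbalanced ')'");
      Ops.pop_back();                      // drop the matching '('
      ++I;
      continue;
    }
    // C is assumed to be one of + - * /.
    while (!Ops.empty() && Ops.back() != '(' && prec(Ops.back()) >= prec(C)) {
      Out.push_back({Ops.back(), 0});
      Ops.pop_back();
    }
    Ops.push_back(C);
    ++I;
  }
  while (!Ops.empty()) { Out.push_back({Ops.back(), 0}); Ops.pop_back(); }
  return Out;
}

// Phase 2: evaluate the postfix stream with a value stack, the same shape as
// InfixCalculator::execute().
int64_t evalPostfix(const std::vector<Tok> &P) {
  std::vector<int64_t> S;
  for (const Tok &T : P) {
    if (T.Op == 0) { S.push_back(T.Val); continue; }
    assert(S.size() >= 2 && "too few operands");
    int64_t R = S.back(); S.pop_back();
    int64_t L = S.back(); S.pop_back();
    switch (T.Op) {
    case '+': S.push_back(L + R); break;
    case '-': S.push_back(L - R); break;
    case '*': S.push_back(L * R); break;
    case '/': assert(R != 0 && "division by zero"); S.push_back(L / R); break;
    }
  }
  return S.empty() ? 0 : S.back();
}
} // namespace

int main() {
  // Prints 17: the value such an Intel-syntax immediate expression folds to.
  std::cout << evalPostfix(toPostfix("(2 + 3) * 4 - 6 / 2")) << "\n";
}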
case IES_REGISTER: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onMinus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + case IES_RPAREN: + case IES_LBRAC: + case IES_RBRAC: + case IES_INTEGER: + case IES_REGISTER: + State = IES_MINUS; + // Only push the minus operator if it is not a unary operator. + if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || + CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || + CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + IC.pushOperator(IC_MINUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onRegister(unsigned Reg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_LPAREN: + State = IES_REGISTER; + TmpReg = Reg; + IC.pushOperand(IC_REGISTER); + break; + case IES_MULTIPLY: + // Index Register - Scale * Register + if (PrevState == IES_INTEGER) { + assert (!IndexReg && "IndexReg already set!"); + State = IES_REGISTER; + IndexReg = Reg; + // Get the scale and replace the 'Scale * Register' with '0'. + Scale = IC.popOperand(); + IC.pushOperand(IC_IMM); + IC.popOperator(); + } else { + State = IES_ERROR; + } + break; + } + PrevState = CurrState; + } + void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + State = IES_INTEGER; + Sym = SymRef; + SymName = SymRefName; + IC.pushOperand(IC_IMM); + break; + } + } + void onInteger(int64_t TmpInt) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_DIVIDE: + case IES_MULTIPLY: + case IES_LPAREN: + State = IES_INTEGER; + if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { + // Index Register - Register * Scale + assert (!IndexReg && "IndexReg already set!"); + IndexReg = TmpReg; + Scale = TmpInt; + // Get the scale and replace the 'Register * Scale' with '0'. + IC.popOperator(); + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + CurrState == IES_MINUS) { + // Unary minus. No need to pop the minus operand because it was never + // pushed. + IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. 
+ } else { + IC.pushOperand(IC_IMM, TmpInt); + } + break; + } + PrevState = CurrState; + } + void onStar() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_MULTIPLY; + IC.pushOperator(IC_MULTIPLY); + break; + } + } + void onDivide() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_DIVIDE; + IC.pushOperator(IC_DIVIDE); + break; + } + } + void onLBrac() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_RBRAC: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + break; + } + } + void onRBrac() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RBRAC; + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onLParen() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + // FIXME: We don't handle this type of unary minus, yet. + if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + CurrState == IES_MINUS) { + State = IES_ERROR; + break; + } + State = IES_LPAREN; + IC.pushOperator(IC_LPAREN); + break; + } + PrevState = CurrState; + } + void onRParen() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RPAREN; + IC.pushOperator(IC_RPAREN); + break; + } + } + }; + MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } bool Error(SMLoc L, const Twine &Msg, - ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(), + ArrayRef<SMRange> Ranges = None, bool MatchingInlineAsm = false) { if (MatchingInlineAsm) return true; return Parser.Error(L, Msg, Ranges); @@ -56,14 +494,25 @@ private: X86Operand *ParseOperand(); X86Operand *ParseATTOperand(); X86Operand *ParseIntelOperand(); - X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc); - X86Operand *ParseIntelOperator(SMLoc StartLoc, unsigned OpKind); - X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc); - X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size); + X86Operand *ParseIntelOffsetOfOperator(); + X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); + X86Operand *ParseIntelOperator(unsigned OpKind); + X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp, + SMLoc StartLoc); + X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); + X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size); + X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); + X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); - bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp, - 
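Note: the IntelExprStateMachine completed above does more than evaluate a constant — as tokens arrive it also decides which register is the base, which register/integer pair forms the scaled index, and what is left over as displacement. A much-simplified sketch of that reduction follows; the names and driver are illustrative, not the parser's API.

#include <cstdint>
#include <iostream>
#include <string>

namespace {
struct MemForm {
  std::string Base, Index;
  unsigned Scale = 1;
  int64_t Disp = 0;
};

// A register with no scale becomes the base (or, once the base is taken, the
// index with scale 1); a "reg*imm" or "imm*reg" pair becomes the scaled
// index; bare integers accumulate into the displacement.
class BracketReducer {
  MemForm M;
public:
  void onRegister(const std::string &R, unsigned Scale = 1) {
    if (Scale != 1) {                 // scaled -> index register
      M.Index = R;
      M.Scale = Scale;
    } else if (M.Base.empty()) {      // first plain register -> base
      M.Base = R;
    } else {                          // second plain register -> index, scale 1
      M.Index = R;
      M.Scale = 1;
    }
  }
  void onInteger(int64_t V) { M.Disp += V; }
  const MemForm &result() const { return M; }
};
} // namespace

int main() {
  // [eax + ebx*4 + 8]  ->  base=eax index=ebx scale=4 disp=8
  BracketReducer R;
  R.onRegister("eax");
  R.onRegister("ebx", /*Scale=*/4);
  R.onInteger(8);
  const MemForm &M = R.result();
  std::cout << "base=" << M.Base << " index=" << M.Index
            << " scale=" << M.Scale << " disp=" << M.Disp << "\n";
}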
SmallString<64> &Err); + X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); @@ -93,6 +542,10 @@ private: setAvailableFeatures(FB); } + bool isParsingIntelSyntax() { + return getParser().getAssemblerDialect(); + } + /// @name Auto-generated Matcher Functions /// { @@ -115,10 +568,6 @@ public: SmallVectorImpl<MCParsedAsmOperand*> &Operands); virtual bool ParseDirective(AsmToken DirectiveID); - - bool isParsingIntelSyntax() { - return getParser().getAssemblerDialect(); - } }; } // end anonymous namespace @@ -168,6 +617,8 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc StartLoc, EndLoc; SMLoc OffsetOfLoc; + StringRef SymName; + void *OpDecl; bool AddressOf; struct TokOp { @@ -181,7 +632,6 @@ struct X86Operand : public MCParsedAsmOperand { struct ImmOp { const MCExpr *Val; - bool NeedAsmRewrite; }; struct MemOp { @@ -191,7 +641,6 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; - bool NeedSizeDir; }; union { @@ -204,6 +653,9 @@ struct X86Operand : public MCParsedAsmOperand { X86Operand(KindTy K, SMLoc Start, SMLoc End) : Kind(K), StartLoc(Start), EndLoc(End) {} + StringRef getSymName() { return SymName; } + void *getOpDecl() { return OpDecl; } + /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. @@ -236,11 +688,6 @@ struct X86Operand : public MCParsedAsmOperand { return Imm.Val; } - bool needAsmRewrite() const { - assert(Kind == Immediate && "Invalid access!"); - return Imm.NeedAsmRewrite; - } - const MCExpr *getMemDisp() const { assert(Kind == Memory && "Invalid access!"); return Mem.Disp; @@ -337,11 +784,6 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } - unsigned getMemSize() const { - assert(Kind == Memory && "Invalid access!"); - return Mem.Size; - } - bool isOffsetOf() const { return OffsetOfLoc.getPointer(); } @@ -350,11 +792,6 @@ struct X86Operand : public MCParsedAsmOperand { return AddressOf; } - bool needSizeDirective() const { - assert(Kind == Memory && "Invalid access!"); - return Mem.NeedSizeDir; - } - bool isMem() const { return Kind == Memory; } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); @@ -394,6 +831,18 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && (!Mem.Size || Mem.Size == 64) && getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; } + bool isMemVZ32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + bool isMemVZ64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + + bool isMem512() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 512); + } bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && @@ -454,6 +903,16 @@ struct X86Operand : public MCParsedAsmOperand { addMemOperands(Inst, N); } + void addMemVZ32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVZ64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } 
+ void addMem512Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); @@ -482,25 +941,28 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, bool AddressOf = false, - SMLoc OffsetOfLoc = SMLoc()) { + SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), + void *OpDecl = 0) { X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; + Res->SymName = SymName; + Res->OpDecl = OpDecl; return Res; } - static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc, - bool NeedRewrite = true){ + static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; - Res->Imm.NeedAsmRewrite = NeedRewrite; return Res; } /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, bool NeedSizeDir = false) { + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = 0) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -508,8 +970,9 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; - Res->Mem.NeedSizeDir = NeedSizeDir; - Res->AddressOf = false; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; return Res; } @@ -517,7 +980,9 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, bool NeedSizeDir = false) { + unsigned Size = 0, + StringRef SymName = StringRef(), + void *OpDecl = 0) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. 
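Note: addMemOperands() above asserts exactly five operands; together they describe the full x86 addressing form segment:[base + scale*index + disp]. A plain-data sketch of that 5-tuple, ordered base, scale, index, displacement, segment as in LLVM's X86 operand layout (no MCInst/MCOperand here):

#include <cstdint>
#include <iostream>
#include <string>

struct X86MemOperand {
  std::string BaseReg;      // empty when absent
  unsigned ScaleAmt;
  std::string IndexReg;
  int64_t Disp;
  std::string SegmentReg;   // empty when there is no segment override
};

int main() {
  // fs:[rbx + 8*rcx + 0x20]
  X86MemOperand Op{"rbx", 8, "rcx", 0x20, "fs"};
  std::cout << "base=" << Op.BaseReg << " scale=" << Op.ScaleAmt
            << " index=" << Op.IndexReg << " disp=" << Op.Disp
            << " seg=" << Op.SegmentReg << "\n";
}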
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -532,8 +997,9 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; - Res->Mem.NeedSizeDir = NeedSizeDir; - Res->AddressOf = false; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; return Res; } }; @@ -689,251 +1155,105 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { return Size; } -enum IntelBracExprState { - IBES_START, - IBES_LBRAC, - IBES_RBRAC, - IBES_REGISTER, - IBES_REGISTER_STAR, - IBES_REGISTER_STAR_INTEGER, - IBES_INTEGER, - IBES_INTEGER_STAR, - IBES_INDEX_REGISTER, - IBES_IDENTIFIER, - IBES_DISP_EXPR, - IBES_MINUS, - IBES_ERROR -}; - -class IntelBracExprStateMachine { - IntelBracExprState State; - unsigned BaseReg, IndexReg, Scale; - int64_t Disp; - - unsigned TmpReg; - int64_t TmpInteger; - - bool isPlus; - -public: - IntelBracExprStateMachine(MCAsmParser &parser) : - State(IBES_START), BaseReg(0), IndexReg(0), Scale(1), Disp(0), - TmpReg(0), TmpInteger(0), isPlus(true) {} - - unsigned getBaseReg() { return BaseReg; } - unsigned getIndexReg() { return IndexReg; } - unsigned getScale() { return Scale; } - int64_t getDisp() { return Disp; } - bool isValidEndState() { return State == IBES_RBRAC; } - - void onPlus() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_INTEGER: - State = IBES_START; - if (isPlus) - Disp += TmpInteger; - else - Disp -= TmpInteger; - break; - case IBES_REGISTER: - State = IBES_START; - // If we already have a BaseReg, then assume this is the IndexReg with a - // scale of 1. - if (!BaseReg) { - BaseReg = TmpReg; - } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); - IndexReg = TmpReg; - Scale = 1; - } - break; - case IBES_INDEX_REGISTER: - State = IBES_START; - break; - } - isPlus = true; - } - void onMinus() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_START: - State = IBES_MINUS; - break; - case IBES_INTEGER: - State = IBES_START; - if (isPlus) - Disp += TmpInteger; - else - Disp -= TmpInteger; - break; - case IBES_REGISTER: - State = IBES_START; - // If we already have a BaseReg, then assume this is the IndexReg with a - // scale of 1. 
- if (!BaseReg) { - BaseReg = TmpReg; - } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); - IndexReg = TmpReg; - Scale = 1; - } - break; - case IBES_INDEX_REGISTER: - State = IBES_START; - break; - } - isPlus = false; - } - void onRegister(unsigned Reg) { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_START: - State = IBES_REGISTER; - TmpReg = Reg; - break; - case IBES_INTEGER_STAR: - assert (!IndexReg && "IndexReg already set!"); - State = IBES_INDEX_REGISTER; - IndexReg = Reg; - Scale = TmpInteger; - break; - } - } - void onDispExpr() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_START: - State = IBES_DISP_EXPR; - break; - } - } - void onInteger(int64_t TmpInt) { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_START: - State = IBES_INTEGER; - TmpInteger = TmpInt; - break; - case IBES_MINUS: - State = IBES_INTEGER; - TmpInteger = TmpInt; - break; - case IBES_REGISTER_STAR: - assert (!IndexReg && "IndexReg already set!"); - State = IBES_INDEX_REGISTER; - IndexReg = TmpReg; - Scale = TmpInt; - break; - } - } - void onStar() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_INTEGER: - State = IBES_INTEGER_STAR; - break; - case IBES_REGISTER: - State = IBES_REGISTER_STAR; - break; +X86Operand * +X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info){ + if (isa<MCSymbolRefExpr>(Disp)) { + // If this is not a VarDecl then assume it is a FuncDecl or some other label + // reference. We need an 'r' constraint here, so we need to create register + // operand to ensure proper matching. Just pick a GPR based on the size of + // a pointer. + if (!Info.IsVarDecl) { + unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true, + SMLoc(), Identifier, Info.OpDecl); } - } - void onLBrac() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_RBRAC: - State = IBES_START; - isPlus = true; - break; + if (!Size) { + Size = Info.Type * 8; // Size is in terms of bits in this context. + if (Size) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, + /*Len=*/0, Size)); } } - void onRBrac() { - switch (State) { - default: - State = IBES_ERROR; - break; - case IBES_DISP_EXPR: - State = IBES_RBRAC; - break; - case IBES_INTEGER: - State = IBES_RBRAC; - if (isPlus) - Disp += TmpInteger; - else - Disp -= TmpInteger; - break; - case IBES_REGISTER: - State = IBES_RBRAC; - // If we already have a BaseReg, then assume this is the IndexReg with a - // scale of 1. - if (!BaseReg) { - BaseReg = TmpReg; - } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); - IndexReg = TmpReg; - Scale = 1; + + // When parsing inline assembly we set the base register to a non-zero value + // if we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + BaseReg = BaseReg ? BaseReg : 1; + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, Identifier, Info.OpDecl); +} + +static void +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, + StringRef SymName, int64_t ImmDisp, + int64_t FinalImmDisp, SMLoc &BracLoc, + SMLoc &StartInBrac, SMLoc &End) { + // Remove the '[' and ']' from the IR string. 
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + + // If ImmDisp is non-zero, then we parsed a displacement before the + // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) + // If ImmDisp doesn't match the displacement computed by the state machine + // then we have an additional displacement in the bracketed expression. + if (ImmDisp != FinalImmDisp) { + if (ImmDisp) { + // We have an immediate displacement before the bracketed expression. + // Adjust this to match the final immediate displacement. + bool Found = false; + for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), + E = AsmRewrites->end(); I != E; ++I) { + if ((*I).Loc.getPointer() > BracLoc.getPointer()) + continue; + if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + assert (!Found && "ImmDisp already rewritten."); + (*I).Kind = AOK_Imm; + (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); + (*I).Val = FinalImmDisp; + Found = true; + break; + } } - break; - case IBES_INDEX_REGISTER: - State = IBES_RBRAC; - break; + assert (Found && "Unable to rewrite ImmDisp."); + (void)Found; + } else { + // We have a symbolic and an immediate displacement, but no displacement + // before the bracketed expression. Put the immediate displacement + // before the bracketed expression. + AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); } } -}; + // Remove all the ImmPrefix rewrites within the brackets. + for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), + E = AsmRewrites->end(); I != E; ++I) { + if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + continue; + if ((*I).Kind == AOK_ImmPrefix) + (*I).Kind = AOK_Delete; + } + const char *SymLocPtr = SymName.data(); + // Skip everything before the symbol. + if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + } + // Skip everything after the symbol. + if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { + SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + } +} -X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, - unsigned Size) { +X86Operand * +X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { const AsmToken &Tok = Parser.getTok(); - SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc(); - - // Eat '[' - if (getLexer().isNot(AsmToken::LBrac)) - return ErrorOperand(Start, "Expected '[' token!"); - Parser.Lex(); - unsigned TmpReg = 0; - - // Try to handle '[' 'symbol' ']' - if (getLexer().is(AsmToken::Identifier)) { - if (ParseRegister(TmpReg, Start, End)) { - const MCExpr *Disp; - if (getParser().parseExpression(Disp, End)) - return 0; - - if (getLexer().isNot(AsmToken::RBrac)) - return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!"); - // Adjust the EndLoc due to the ']'. - End = SMLoc::getFromPointer(Parser.getTok().getEndLoc().getPointer()-1); - Parser.Lex(); - return X86Operand::CreateMem(Disp, Start, End, Size); - } - } - - // Parse [ BaseReg + Scale*IndexReg + Disp ]. bool Done = false; - IntelBracExprStateMachine SM(Parser); - - // If we parsed a register, then the end loc has already been set and - // the identifier has already been lexed. We also need to update the - // state. 
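Note: RewriteIntelBracExpression() above never edits the inline-asm string directly; it records AsmRewrite entries (skip these bytes, replace that range with an immediate) that are applied to the MS inline-asm text later. The sketch below shows that record-then-apply idea with minimal stand-in types; AsmRewrite/AOK_* are the real LLVM names, everything below is not.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct Rewrite {
  std::size_t Pos;      // offset into the original text
  std::size_t Len;      // how many original characters to drop
  std::string Text;     // replacement (empty => plain skip)
};

// Apply recorded edits back to front so earlier offsets stay valid.
static std::string applyRewrites(std::string S, std::vector<Rewrite> Rs) {
  std::sort(Rs.begin(), Rs.end(),
            [](const Rewrite &A, const Rewrite &B) { return A.Pos > B.Pos; });
  for (const Rewrite &R : Rs)
    S.replace(R.Pos, R.Len, R.Text);
  return S;
}

int main() {
  std::string Src = "mov eax, [x + 4]";
  // Two skip-style rewrites: drop the '[' and the ']'.
  std::vector<Rewrite> Rs = {{9, 1, ""}, {Src.size() - 1, 1, ""}};
  std::cout << applyRewrites(Src, Rs) << "\n";   // mov eax, x + 4
}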
- if (TmpReg) - SM.onRegister(TmpReg); - - const MCExpr *Disp = 0; while (!Done) { bool UpdateLocLex = true; @@ -941,6 +1261,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, // identifier. Don't try an parse it as a register. if (Tok.getString().startswith(".")) break; + + // If we're parsing an immediate expression, we don't expect a '['. + if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) + break; switch (getLexer().getKind()) { default: { @@ -950,82 +1274,185 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, } return ErrorOperand(Tok.getLoc(), "Unexpected token!"); } + case AsmToken::EndOfStatement: { + Done = true; + break; + } case AsmToken::Identifier: { - // This could be a register or a displacement expression. - if(!ParseRegister(TmpReg, Start, End)) { + // This could be a register or a symbolic displacement. + unsigned TmpReg; + const MCExpr *Val; + SMLoc IdentLoc = Tok.getLoc(); + StringRef Identifier = Tok.getString(); + if(!ParseRegister(TmpReg, IdentLoc, End)) { SM.onRegister(TmpReg); UpdateLocLex = false; break; - } else if (!getParser().parseExpression(Disp, End)) { - SM.onDispExpr(); + } else { + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + } else { + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; + } + SM.onIdentifierExpr(Val, Identifier); UpdateLocLex = false; break; } return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); } - case AsmToken::Integer: { - int64_t Val = Tok.getIntVal(); - SM.onInteger(Val); + case AsmToken::Integer: + if (isParsingInlineAsm() && SM.getAddImmPrefix()) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, + Tok.getLoc())); + SM.onInteger(Tok.getIntVal()); break; - } case AsmToken::Plus: SM.onPlus(); break; case AsmToken::Minus: SM.onMinus(); break; case AsmToken::Star: SM.onStar(); break; + case AsmToken::Slash: SM.onDivide(); break; case AsmToken::LBrac: SM.onLBrac(); break; case AsmToken::RBrac: SM.onRBrac(); break; + case AsmToken::LParen: SM.onLParen(); break; + case AsmToken::RParen: SM.onRParen(); break; } + if (SM.hadError()) + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + if (!Done && UpdateLocLex) { End = Tok.getLoc(); Parser.Lex(); // Consume the token. } } + return 0; +} - if (!Disp) - Disp = MCConstantExpr::Create(SM.getDisp(), getContext()); +X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, + unsigned Size) { + const AsmToken &Tok = Parser.getTok(); + SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(BracLoc, "Expected '[' token!"); + Parser.Lex(); // Eat '[' + + SMLoc StartInBrac = Tok.getLoc(); + // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We + // may have already parsed an immediate displacement before the bracketed + // expression. + IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); + if (X86Operand *Err = ParseIntelExpression(SM, End)) + return Err; + + const MCExpr *Disp; + if (const MCExpr *Sym = SM.getSym()) { + // A symbolic displacement. 
+ Disp = Sym; + if (isParsingInlineAsm()) + RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + ImmDisp, SM.getImm(), BracLoc, StartInBrac, + End); + } else { + // An immediate displacement only. + Disp = MCConstantExpr::Create(SM.getImm(), getContext()); + } // Parse the dot operator (e.g., [ebx].foo.bar). if (Tok.getString().startswith(".")) { - SmallString<64> Err; const MCExpr *NewDisp; - if (ParseIntelDotOperator(Disp, &NewDisp, Err)) - return ErrorOperand(Tok.getLoc(), Err); + if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp)) + return Err; - End = Parser.getTok().getEndLoc(); + End = Tok.getEndLoc(); Parser.Lex(); // Eat the field. Disp = NewDisp; } int BaseReg = SM.getBaseReg(); int IndexReg = SM.getIndexReg(); + int Scale = SM.getScale(); + if (!isParsingInlineAsm()) { + // handle [-42] + if (!BaseReg && !IndexReg) { + if (!SegReg) + return X86Operand::CreateMem(Disp, Start, End, Size); + else + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + } + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size); + } - // handle [-42] - if (!BaseReg && !IndexReg) { - if (!SegReg) - return X86Operand::CreateMem(Disp, Start, End); - else - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, SM.getSymName(), Info); +} + +// Inline assembly may use variable names with namespace alias qualifiers. +X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, + SMLoc &End) { + assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + Val = 0; + + StringRef LineBuf(Identifier.data()); + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + + const AsmToken &Tok = Parser.getTok(); + + // Advance the token stream until the end of the current token is + // after the end of what the frontend claimed. + const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); + while (true) { + End = Tok.getEndLoc(); + getLexer().Lex(); + + assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); + if (End.getPointer() == EndPtr) break; } - int Scale = SM.getScale(); - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, - Start, End, Size); + // Create the symbol reference. + Identifier = LineBuf; + MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); + return 0; } /// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { +X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, + int64_t ImmDisp, + SMLoc Start) { const AsmToken &Tok = Parser.getTok(); SMLoc End; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { - Parser.Lex(); - assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") && - "Unexpected token!"); - Parser.Lex(); + Parser.Lex(); // Eat operand size (e.g., byte, word). + if (Tok.getString() != "PTR" && Tok.getString() != "ptr") + return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); + Parser.Lex(); // Eat ptr. + } + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. 
+ if (getLexer().is(AsmToken::Integer)) { + if (isParsingInlineAsm()) + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, + Tok.getLoc())); + int64_t ImmDisp = Tok.getIntVal(); + Parser.Lex(); // Eat the integer. + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(Start, "Expected '[' token!"); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); } if (getLexer().is(AsmToken::LBrac)) - return ParseIntelBracExpression(SegReg, Size); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); if (!ParseRegister(SegReg, Start, End)) { // Handel SegReg : [ ... ] @@ -1034,63 +1461,37 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { Parser.Lex(); // Eat : if (getLexer().isNot(AsmToken::LBrac)) return ErrorOperand(Start, "Expected '[' token!"); - return ParseIntelBracExpression(SegReg, Size); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); } - const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); - if (getParser().parseExpression(Disp, End)) - return 0; + const MCExpr *Val; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); - bool NeedSizeDir = false; - bool IsVarDecl = false; - if (isParsingInlineAsm()) { - if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) { - const MCSymbol &Sym = SymRef->getSymbol(); - // FIXME: The SemaLookup will fail if the name is anything other then an - // identifier. - // FIXME: Pass a valid SMLoc. - unsigned tLength, tSize, tType; - SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, tLength, - tSize, tType, IsVarDecl); - if (!Size) - Size = tType * 8; // Size is in terms of bits in this context. - NeedSizeDir = Size > 0; - } + return X86Operand::CreateMem(Val, Start, End, Size); } - if (!isParsingInlineAsm()) - return X86Operand::CreateMem(Disp, Start, End, Size); - else { - // If this is not a VarDecl then assume it is a FuncDecl or some other label - // reference. We need an 'r' constraint here, so we need to create register - // operand to ensure proper matching. Just pick a GPR based on the size of - // a pointer. - if (!IsVarDecl) { - unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; - return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true); - } - // When parsing inline assembly we set the base register to a non-zero value - // as we don't know the actual value at this time. This is necessary to - // get the matching correct in some cases. - return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0, - /*Scale*/1, Start, End, Size, NeedSizeDir); - } + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); } /// Parse the '.' operator. -bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, - const MCExpr **NewDisp, - SmallString<64> &Err) { - AsmToken Tok = *&Parser.getTok(); - uint64_t OrigDispVal, DotDispVal; +X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, + const MCExpr *&NewDisp) { + const AsmToken &Tok = Parser.getTok(); + int64_t OrigDispVal, DotDispVal; // FIXME: Handle non-constant expressions. 
- if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) { + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) OrigDispVal = OrigDisp->getValue(); - } else { - Err = "Non-constant offsets are not supported!"; - return true; - } + else + return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!"); // Drop the '.'. StringRef DotDispStr = Tok.getString().drop_front(1); @@ -1100,23 +1501,15 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, APInt DotDisp; DotDispStr.getAsInteger(10, DotDisp); DotDispVal = DotDisp.getZExtValue(); - } else if (Tok.is(AsmToken::Identifier)) { - // We should only see an identifier when parsing the original inline asm. - // The front-end should rewrite this in terms of immediates. - assert (isParsingInlineAsm() && "Unexpected field name!"); - + } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { unsigned DotDisp; std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, - DotDisp)) { - Err = "Unable to lookup field reference!"; - return true; - } + DotDisp)) + return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!"); DotDispVal = DotDisp; - } else { - Err = "Unexpected token type!"; - return true; - } + } else + return ErrorOperand(Tok.getLoc(), "Unexpected token type!"); if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); @@ -1126,22 +1519,24 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, Val)); } - *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); - return false; + NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); + return 0; } /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. -X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { - SMLoc OffsetOfLoc = Start; +X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { + const AsmToken &Tok = Parser.getTok(); + SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. - Start = Parser.getTok().getLoc(); - assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier"); - SMLoc End; const MCExpr *Val; - if (getParser().parseExpression(Val, End)) - return ErrorOperand(Start, "Unable to parse expression!"); + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ false, End)) + return Err; // Don't emit the offset operator. InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); @@ -1151,7 +1546,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { // the size of a pointer. unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, - OffsetOfLoc); + OffsetOfLoc, Identifier, Info.OpDecl); } enum IntelOperatorKind { @@ -1166,34 +1561,25 @@ enum IntelOperatorKind { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. -X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) { - SMLoc TypeLoc = Start; - Parser.Lex(); // Eat offset. 
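Note: the doc comment above spells out the folding rules for the Intel operators — TYPE is the size of one element, LENGTH the element count, and SIZE their product — and ParseIntelOperator() rewrites the whole operator expression as a single immediate (AOK_Imm). For a concrete C array, assuming a typical 4-byte int, the folded constants are:

#include <cstddef>
#include <iostream>

int main() {
  int arr[10];
  constexpr std::size_t Type   = sizeof(arr[0]);                 // TYPE arr   -> 4
  constexpr std::size_t Length = sizeof(arr) / sizeof(arr[0]);   // LENGTH arr -> 10
  constexpr std::size_t Size   = Type * Length;                  // SIZE arr   -> 40
  (void)arr;
  std::cout << Type << " " << Length << " " << Size << "\n";
}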
- Start = Parser.getTok().getLoc(); - assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier"); - - SMLoc End; - const MCExpr *Val; - if (getParser().parseExpression(Val, End)) - return 0; - - unsigned Length = 0, Size = 0, Type = 0; - if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) { - const MCSymbol &Sym = SymRef->getSymbol(); - // FIXME: The SemaLookup will fail if the name is anything other then an - // identifier. - // FIXME: Pass a valid SMLoc. - bool IsVarDecl; - if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Length, - Size, Type, IsVarDecl)) - return ErrorOperand(Start, "Unable to lookup expr!"); - } - unsigned CVal; +X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { + const AsmToken &Tok = Parser.getTok(); + SMLoc TypeLoc = Tok.getLoc(); + Parser.Lex(); // Eat operator. + + const MCExpr *Val = 0; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated*/ true, End)) + return Err; + + unsigned CVal = 0; switch(OpKind) { default: llvm_unreachable("Unexpected operand kind!"); - case IOK_LENGTH: CVal = Length; break; - case IOK_SIZE: CVal = Size; break; - case IOK_TYPE: CVal = Type; break; + case IOK_LENGTH: CVal = Info.Length; break; + case IOK_SIZE: CVal = Info.Size; break; + case IOK_TYPE: CVal = Info.Type; break; } // Rewrite the type operator and the C or C++ type or variable in terms of an @@ -1202,32 +1588,58 @@ X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) { InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext()); - return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false); + return X86Operand::CreateImm(Imm, Start, End); } X86Operand *X86AsmParser::ParseIntelOperand() { - SMLoc Start = Parser.getTok().getLoc(), End; - StringRef AsmTokStr = Parser.getTok().getString(); + const AsmToken &Tok = Parser.getTok(); + SMLoc Start = Tok.getLoc(), End; // Offset, length, type and size operators. if (isParsingInlineAsm()) { + StringRef AsmTokStr = Tok.getString(); if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") - return ParseIntelOffsetOfOperator(Start); + return ParseIntelOffsetOfOperator(); if (AsmTokStr == "length" || AsmTokStr == "LENGTH") - return ParseIntelOperator(Start, IOK_LENGTH); + return ParseIntelOperator(IOK_LENGTH); if (AsmTokStr == "size" || AsmTokStr == "SIZE") - return ParseIntelOperator(Start, IOK_SIZE); + return ParseIntelOperator(IOK_SIZE); if (AsmTokStr == "type" || AsmTokStr == "TYPE") - return ParseIntelOperator(Start, IOK_TYPE); + return ParseIntelOperator(IOK_TYPE); } // Immediate. - if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) || - getLexer().is(AsmToken::Minus)) { - const MCExpr *Val; - if (!getParser().parseExpression(Val, End)) { - return X86Operand::CreateImm(Val, Start, End); + if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || + getLexer().is(AsmToken::LParen)) { + AsmToken StartTok = Tok; + IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (X86Operand *Err = ParseIntelExpression(SM, End)) + return Err; + + int64_t Imm = SM.getImm(); + if (isParsingInlineAsm()) { + unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); + if (StartTok.getString().size() == Len) + // Just add a prefix if this wasn't a complex immediate expression. 
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + else + // Otherwise, rewrite the complex expression as a single immediate. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + } + + if (getLexer().isNot(AsmToken::LBrac)) { + const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext()); + return X86Operand::CreateImm(ImmExpr, Start, End); } + + // Only positive immediates are valid. + if (Imm < 0) + return ErrorOperand(Start, "expected a positive immediate displacement " + "before bracketed expr."); + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start); } // Register. @@ -1239,11 +1651,11 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return X86Operand::CreateReg(RegNo, Start, End); getParser().Lex(); // Eat the colon. - return ParseIntelMemOperand(RegNo, Start); + return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start); } // Memory operand. - return ParseIntelMemOperand(0, Start); + return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start); } X86Operand *X86AsmParser::ParseATTOperand() { @@ -1267,7 +1679,6 @@ X86Operand *X86AsmParser::ParseATTOperand() { if (getLexer().isNot(AsmToken::Colon)) return X86Operand::CreateReg(RegNo, Start, End); - getParser().Lex(); // Eat the colon. return ParseMemOperand(RegNo, Start); } @@ -1819,7 +2230,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, assert(!Operands.empty() && "Unexpect empty operand list!"); X86Operand *Op = static_cast<X86Operand*>(Operands[0]); assert(Op->isToken() && "Leading operand should always be a mnemonic!"); - ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>(); + ArrayRef<SMRange> EmptyRanges = None; // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. @@ -1921,25 +2332,25 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned Match1, Match2, Match3, Match4; Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - isParsingIntelSyntax()); + MatchingInlineAsm, isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. if (Match1 == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[1]; Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - isParsingIntelSyntax()); + MatchingInlineAsm, isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. if (Match2 == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[2]; Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - isParsingIntelSyntax()); + MatchingInlineAsm, isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. if (Match3 == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; Tmp[Base.size()] = Suffixes[3]; Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - isParsingIntelSyntax()); + MatchingInlineAsm, isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. 
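Note: the four MatchInstructionImpl calls above implement a simple retry scheme — if the bare mnemonic fails to match, the parser overwrites the trailing character with each candidate size suffix and keeps the first spelling the generated matcher accepts. A toy version of that loop; the table and suffix set here are made up for illustration.

#include <iostream>
#include <set>
#include <string>

static bool matchOpcode(const std::string &Mnemonic) {
  // Stand-in for the generated matcher table.
  static const std::set<std::string> Known = {"addl", "addq", "movl", "movq"};
  return Known.count(Mnemonic) != 0;
}

int main() {
  const std::string Base = "add";
  const char Suffixes[] = {'\0', 'b', 'w', 'l', 'q'};   // '\0' = no suffix
  for (char S : Suffixes) {
    std::string Candidate = Base;
    if (S) Candidate += S;
    if (matchOpcode(Candidate)) {
      std::cout << "matched: " << Candidate << "\n";    // prints "matched: addl"
      return 0;
    }
  }
  std::cout << "no match for " << Base << "\n";
  return 1;
}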
if (Match4 == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index d14899d..7e20151 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -33,6 +33,7 @@ set(sources X86TargetObjectFile.cpp X86TargetTransformInfo.cpp X86VZeroUpper.cpp + X86FixupLEAs.cpp ) if( CMAKE_CL_64 ) @@ -52,7 +53,7 @@ endif() add_llvm_target(X86CodeGen ${sources}) -add_dependencies(LLVMX86CodeGen intrinsics_gen) +add_dependencies(LLVMX86CodeGen X86CommonTableGen intrinsics_gen) add_subdirectory(AsmParser) add_subdirectory(Disassembler) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index ca6f80c..82af6fa 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -190,94 +190,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, uint64_t Width, MCInst &MI, const MCDisassembler *Dis) { - LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback(); - struct LLVMOpInfo1 SymbolicOp; - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - SymbolicOp.Value = Value; - void *DisInfo = Dis->getDisInfoBlock(); - - if (!getOpInfo || - !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) { - // Clear SymbolicOp.Value from above and also all other fields. - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); - if (!SymbolLookUp) - return false; - uint64_t ReferenceType; - if (isBranch) - ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; - else - ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; - const char *ReferenceName; - const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, - &ReferenceName); - if (Name) { - SymbolicOp.AddSymbol.Name = Name; - SymbolicOp.AddSymbol.Present = true; - } - // For branches always create an MCExpr so it gets printed as hex address. 
- else if (isBranch) { - SymbolicOp.Value = Value; - } - if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) - (*Dis->CommentStream) << "symbol stub for: " << ReferenceName; - if (!Name && !isBranch) - return false; - } - - MCContext *Ctx = Dis->getMCContext(); - const MCExpr *Add = NULL; - if (SymbolicOp.AddSymbol.Present) { - if (SymbolicOp.AddSymbol.Name) { - StringRef Name(SymbolicOp.AddSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - Add = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx); - } - } - - const MCExpr *Sub = NULL; - if (SymbolicOp.SubtractSymbol.Present) { - if (SymbolicOp.SubtractSymbol.Name) { - StringRef Name(SymbolicOp.SubtractSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - Sub = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx); - } - } - - const MCExpr *Off = NULL; - if (SymbolicOp.Value != 0) - Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); - - const MCExpr *Expr; - if (Sub) { - const MCExpr *LHS; - if (Add) - LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); - else - LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); - else - Expr = LHS; - } else if (Add) { - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); - else - Expr = Add; - } else { - if (Off != 0) - Expr = Off; - else - Expr = MCConstantExpr::Create(0, *Ctx); - } - - MI.addOperand(MCOperand::CreateExpr(Expr)); - - return true; + return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, + Offset, Width); } /// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being @@ -290,15 +204,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, const void *Decoder) { const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); - LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); - if (SymbolLookUp) { - void *DisInfo = Dis->getDisInfoBlock(); - uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load; - const char *ReferenceName; - (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName); - if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - (*Dis->CommentStream) << "literal pool for: " << ReferenceName; - } + Dis->tryAddingPcLoadReferenceComment(Value, Address); } /// translateImmediate - Appends an immediate operand to an MCInst. 
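Note: the block being deleted above hand-built the symbolic operand as an MCExpr of the shape (AddSymbol - SubSymbol) + Offset, with every piece optional; that construction now lives behind MCDisassembler::tryAddingSymbolicOperand(). A string-based sketch of the same composition rules, with strings standing in for MCExpr nodes:

#include <cstdint>
#include <iostream>
#include <string>

// Build "(Add - Sub) + Off" with each piece optional, mirroring the branch
// structure of the deleted code.
static std::string buildSymbolicExpr(const std::string &Add,
                                     const std::string &Sub, int64_t Off) {
  std::string E;
  if (!Sub.empty())
    E = Add.empty() ? "-" + Sub : "(" + Add + " - " + Sub + ")";
  else
    E = Add;                                      // possibly still empty
  if (Off != 0)
    E = E.empty() ? std::to_string(Off) : E + " + " + std::to_string(Off);
  return E.empty() ? "0" : E;
}

int main() {
  std::cout << buildSymbolicExpr("_foo", "", 16) << "\n";  // _foo + 16
  std::cout << buildSymbolicExpr("_a", "_b", 0) << "\n";   // (_a - _b)
  std::cout << buildSymbolicExpr("", "", 0) << "\n";       // 0
}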
@@ -380,6 +286,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_XMM256: mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); return; + case TYPE_XMM512: + mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4))); + return; case TYPE_REL8: isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; @@ -537,6 +446,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, EA_BASES_64BIT REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY } } else { @@ -659,6 +569,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM64: case TYPE_XMM128: case TYPE_XMM256: + case TYPE_XMM512: case TYPE_DEBUGREG: case TYPE_CONTROLREG: return translateRMRegister(mcInst, insn); @@ -777,6 +688,15 @@ static bool translateInstruction(MCInst &mcInst, } mcInst.setOpcode(insn.instructionID); + // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 + // prefix bytes should be disassembled as xrelease and xacquire then set the + // opcode to those instead of the rep and repne opcodes. + if (insn.xAcquireRelease) { + if(mcInst.getOpcode() == X86::REP_PREFIX) + mcInst.setOpcode(X86::XRELEASE_PREFIX); + else if(mcInst.getOpcode() == X86::REPNE_PREFIX) + mcInst.setOpcode(X86::XACQUIRE_PREFIX); + } int index; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index 85d8a99..bb195ee 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -61,7 +61,7 @@ static int modRMRequired(OpcodeType type, InstructionContext insnContext, uint8_t opcode) { const struct ContextDecision* decision = 0; - + switch (type) { case ONEBYTE: decision = &ONEBYTE_SYM; @@ -102,7 +102,7 @@ static InstrUID decode(OpcodeType type, uint8_t opcode, uint8_t modRM) { const struct ModRMDecision* dec = 0; - + switch (type) { case ONEBYTE: dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; @@ -123,7 +123,7 @@ static InstrUID decode(OpcodeType type, dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; } - + switch (dec->modrm_type) { default: debug("Corrupt table! Unknown modrm_type"); @@ -171,10 +171,10 @@ static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { */ static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); - + if (!ret) ++(insn->readerCursor); - + return ret; } @@ -238,19 +238,19 @@ CONSUME_FUNC(consumeUInt64, uint64_t) */ static void dbgprintf(struct InternalInstruction* insn, const char* format, - ...) { + ...) { char buffer[256]; va_list ap; - + if (!insn->dlog) return; - + va_start(ap, format); (void)vsnprintf(buffer, sizeof(buffer), format, ap); va_end(ap); - + insn->dlog(insn->dlogArg, buffer); - + return; } @@ -305,27 +305,61 @@ static int readPrefixes(struct InternalInstruction* insn) { BOOL prefixGroups[4] = { FALSE }; uint64_t prefixLocation; uint8_t byte = 0; - + BOOL hasAdSize = FALSE; BOOL hasOpSize = FALSE; - + dbgprintf(insn, "readPrefixes()"); - + while (isPrefix) { prefixLocation = insn->readerCursor; - + if (consumeByte(insn, &byte)) return -1; /* - * If the first byte is a LOCK prefix break and let it be disassembled - * as a lock "instruction", by creating an <MCInst #xxxx LOCK_PREFIX>. 
- * FIXME there is currently no way to get the disassembler to print the - * lock prefix if it is not the first byte. + * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + * break and let it be disassembled as a normal "instruction". */ - if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) - break; - + if (insn->readerCursor - 1 == insn->startLocation + && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) { + uint8_t nextByte; + if (byte == 0xf0) + break; + if (lookAtByte(insn, &nextByte)) + return -1; + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) | + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = TRUE; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. + */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = TRUE; + if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { + if (consumeByte(insn, &nextByte)) + return -1; + if (lookAtByte(insn, &nextByte)) + return -1; + unconsumeByte(insn); + } + if (nextByte != 0x0f && nextByte != 0x90) + break; + } + switch (byte) { case 0xf0: /* LOCK */ case 0xf2: /* REPNE/REPNZ */ @@ -387,21 +421,21 @@ static int readPrefixes(struct InternalInstruction* insn) { isPrefix = FALSE; break; } - + if (isPrefix) dbgprintf(insn, "Found prefix 0x%hhx", byte); } - + insn->vexSize = 0; - + if (byte == 0xc4) { uint8_t byte1; - + if (lookAtByte(insn, &byte1)) { dbgprintf(insn, "Couldn't read second byte of VEX"); return -1; } - + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vexSize = 3; insn->necessaryPrefixLocation = insn->readerCursor - 1; @@ -410,67 +444,67 @@ static int readPrefixes(struct InternalInstruction* insn) { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } - + if (insn->vexSize == 3) { insn->vexPrefix[0] = byte; consumeByte(insn, &insn->vexPrefix[1]); consumeByte(insn, &insn->vexPrefix[2]); /* We simulate the REX prefix for simplicity's sake */ - + if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 + insn->rexPrefix = 0x40 | (wFromVEX3of3(insn->vexPrefix[2]) << 3) | (rFromVEX2of3(insn->vexPrefix[1]) << 2) | (xFromVEX2of3(insn->vexPrefix[1]) << 1) | (bFromVEX2of3(insn->vexPrefix[1]) << 0); } - + switch (ppFromVEX3of3(insn->vexPrefix[2])) { default: break; case VEX_PREFIX_66: - hasOpSize = TRUE; + hasOpSize = TRUE; break; } - + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); } } else if (byte == 0xc5) { uint8_t byte1; - + if (lookAtByte(insn, &byte1)) { dbgprintf(insn, "Couldn't read second byte of VEX"); return -1; } - + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vexSize = 2; } else { unconsumeByte(insn); } - + if (insn->vexSize == 2) { insn->vexPrefix[0] = byte; consumeByte(insn, &insn->vexPrefix[1]); - + if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 + insn->rexPrefix = 0x40 | (rFromVEX2of2(insn->vexPrefix[1]) << 2); } - + switch (ppFromVEX2of2(insn->vexPrefix[1])) { default: break; 
case VEX_PREFIX_66: - hasOpSize = TRUE; + hasOpSize = TRUE; break; } - + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); } } @@ -478,17 +512,17 @@ static int readPrefixes(struct InternalInstruction* insn) { if (insn->mode == MODE_64BIT) { if ((byte & 0xf0) == 0x40) { uint8_t opcodeByte; - + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { dbgprintf(insn, "Redundant REX prefix"); return -1; } - + insn->rexPrefix = byte; insn->necessaryPrefixLocation = insn->readerCursor - 2; - + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else { + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } @@ -526,7 +560,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->immediateSize = (hasOpSize ? 2 : 4); } } - + return 0; } @@ -537,22 +571,22 @@ static int readPrefixes(struct InternalInstruction* insn) { * @param insn - The instruction whose opcode is to be read. * @return - 0 if the opcode could be read successfully; nonzero otherwise. */ -static int readOpcode(struct InternalInstruction* insn) { +static int readOpcode(struct InternalInstruction* insn) { /* Determine the length of the primary opcode */ - + uint8_t current; - + dbgprintf(insn, "readOpcode()"); - + insn->opcodeType = ONEBYTE; - + if (insn->vexSize == 3) { switch (mmmmmFromVEX2of3(insn->vexPrefix[1])) { default: dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1])); - return -1; + return -1; case 0: break; case VEX_LOB_0F: @@ -564,7 +598,7 @@ static int readOpcode(struct InternalInstruction* insn) { insn->threeByteEscape = 0x38; insn->opcodeType = THREEBYTE_38; return consumeByte(insn, &insn->opcode); - case VEX_LOB_0F3A: + case VEX_LOB_0F3A: insn->twoByteEscape = 0x0f; insn->threeByteEscape = 0x3a; insn->opcodeType = THREEBYTE_3A; @@ -577,68 +611,68 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = TWOBYTE; return consumeByte(insn, &insn->opcode); } - + if (consumeByte(insn, ¤t)) return -1; - + if (current == 0x0f) { dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); - + insn->twoByteEscape = current; - + if (consumeByte(insn, ¤t)) return -1; - + if (current == 0x38) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - + insn->threeByteEscape = current; - + if (consumeByte(insn, ¤t)) return -1; - + insn->opcodeType = THREEBYTE_38; } else if (current == 0x3a) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - + insn->threeByteEscape = current; - + if (consumeByte(insn, ¤t)) return -1; - + insn->opcodeType = THREEBYTE_3A; } else if (current == 0xa6) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - + insn->threeByteEscape = current; - + if (consumeByte(insn, ¤t)) return -1; - + insn->opcodeType = THREEBYTE_A6; } else if (current == 0xa7) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - + insn->threeByteEscape = current; - + if (consumeByte(insn, ¤t)) return -1; - + insn->opcodeType = THREEBYTE_A7; } else { dbgprintf(insn, "Didn't find a three-byte escape prefix"); - + insn->opcodeType = TWOBYTE; } } - + /* * At this point we have consumed the full opcode. * Anything we consume from here on must be unconsumed. 
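As a compact summary of the escape-byte handling in readOpcode() (the VEX path selects the same tables directly from the m-mmmm field), here is a sketch of the classification, assuming the 0x66/0xf2/0xf3 and REX prefixes have already been consumed:

    #include <cstdint>

    enum OpcodeType { ONEBYTE, TWOBYTE, THREEBYTE_38, THREEBYTE_3A,
                      THREEBYTE_A6, THREEBYTE_A7 };

    // b0 is the first non-prefix byte, b1 the byte after it (only looked at
    // when b0 is the 0x0f escape).
    static OpcodeType classifyEscape(uint8_t b0, uint8_t b1) {
      if (b0 != 0x0f)
        return ONEBYTE;              // b0 itself is the primary opcode
      switch (b1) {
      case 0x38: return THREEBYTE_38;
      case 0x3a: return THREEBYTE_3A;
      case 0xa6: return THREEBYTE_A6;
      case 0xa7: return THREEBYTE_A7;
      default:   return TWOBYTE;     // plain two-byte map
      }
    }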
*/ - + insn->opcode = current; - + return 0; } @@ -660,19 +694,19 @@ static int getIDWithAttrMask(uint16_t* instructionID, struct InternalInstruction* insn, uint8_t attrMask) { BOOL hasModRMExtension; - + uint8_t instructionClass; instructionClass = contextForAttrs(attrMask); - + hasModRMExtension = modRMRequired(insn->opcodeType, instructionClass, insn->opcode); - + if (hasModRMExtension) { if (readModRM(insn)) return -1; - + *instructionID = decode(insn->opcodeType, instructionClass, insn->opcode, @@ -683,7 +717,7 @@ static int getIDWithAttrMask(uint16_t* instructionID, insn->opcode, 0); } - + return 0; } @@ -696,7 +730,7 @@ static int getIDWithAttrMask(uint16_t* instructionID, */ static BOOL is16BitEquivalent(const char* orig, const char* equiv) { off_t i; - + for (i = 0;; i++) { if (orig[i] == '\0' && equiv[i] == '\0') return TRUE; @@ -715,8 +749,8 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) { } /* - * getID - Determines the ID of an instruction, consuming the ModR/M byte as - * appropriate for extended and escape opcodes. Determines the attributes and + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and * context for the instruction before doing so. * * @param insn - The instruction whose ID is to be determined. @@ -726,21 +760,21 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) { static int getID(struct InternalInstruction* insn, const void *miiArg) { uint8_t attrMask; uint16_t instructionID; - + dbgprintf(insn, "getID()"); - + attrMask = ATTR_NONE; if (insn->mode == MODE_64BIT) attrMask |= ATTR_64BIT; - + if (insn->vexSize) { attrMask |= ATTR_VEX; if (insn->vexSize == 3) { switch (ppFromVEX3of3(insn->vexPrefix[2])) { case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; + attrMask |= ATTR_OPSIZE; break; case VEX_PREFIX_F3: attrMask |= ATTR_XS; @@ -749,14 +783,14 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { attrMask |= ATTR_XD; break; } - + if (lFromVEX3of3(insn->vexPrefix[2])) attrMask |= ATTR_VEXL; } else if (insn->vexSize == 2) { switch (ppFromVEX2of2(insn->vexPrefix[1])) { case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; + attrMask |= ATTR_OPSIZE; break; case VEX_PREFIX_F3: attrMask |= ATTR_XS; @@ -765,7 +799,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { attrMask |= ATTR_XD; break; } - + if (lFromVEX2of2(insn->vexPrefix[1])) attrMask |= ATTR_VEXL; } @@ -836,26 +870,26 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { * conservative, but in the specific case where OpSize is present but not * in the right place we check if there's a 16-bit operation. 
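The attribute mask getID() assembles from the VEX fields can be summarized as below; the bit values are illustrative (the real ones come from the attributeBits enum), and the pp mapping assumes the architectural encoding 01=0x66, 10=0xF3, 11=0xF2:

    #include <cstdint>

    // Illustrative attribute bits, not the generated-table values.
    enum { ATTR_64BIT = 1, ATTR_OPSIZE = 2, ATTR_XS = 4,
           ATTR_XD = 8, ATTR_VEX = 16, ATTR_VEXL = 32 };

    static uint8_t vexAttrMask(bool is64Bit, uint8_t pp, bool vexL) {
      uint8_t attr = ATTR_VEX;
      if (is64Bit) attr |= ATTR_64BIT;
      switch (pp & 0x3) {
      case 1: attr |= ATTR_OPSIZE; break;  // implied 0x66
      case 2: attr |= ATTR_XS;     break;  // implied 0xf3
      case 3: attr |= ATTR_XD;     break;  // implied 0xf2
      default: break;                      // no implied prefix
      }
      if (vexL) attr |= ATTR_VEXL;         // 256-bit forms
      return attr;
    }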
*/ - + const struct InstructionSpecifier *spec; uint16_t instructionIDWithOpsize; const char *specName, *specWithOpSizeName; - + spec = specifierForUID(instructionID); - + if (getIDWithAttrMask(&instructionIDWithOpsize, insn, attrMask | ATTR_OPSIZE)) { - /* + /* * ModRM required with OpSize but not present; give up and return version * without OpSize set */ - + insn->instructionID = instructionID; insn->spec = spec; return 0; } - + specName = x86DisassemblerGetInstrName(instructionID, miiArg); specWithOpSizeName = x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg); @@ -882,10 +916,10 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { const struct InstructionSpecifier *specWithNewOpcode; spec = specifierForUID(instructionID); - + /* Borrow opcode from one of the other XCHGar opcodes */ insn->opcode = 0x91; - + if (getIDWithAttrMask(&instructionIDWithNewOpcode, insn, attrMask)) { @@ -906,10 +940,10 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { return 0; } - + insn->instructionID = instructionID; insn->spec = specifierForUID(insn->instructionID); - + return 0; } @@ -924,14 +958,14 @@ static int readSIB(struct InternalInstruction* insn) { SIBIndex sibIndexBase = 0; SIBBase sibBaseBase = 0; uint8_t index, base; - + dbgprintf(insn, "readSIB()"); - + if (insn->consumedSIB) return 0; - + insn->consumedSIB = TRUE; - + switch (insn->addressSize) { case 2: dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); @@ -949,9 +983,9 @@ static int readSIB(struct InternalInstruction* insn) { if (consumeByte(insn, &insn->sib)) return -1; - + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); - + switch (index) { case 0x4: insn->sibIndex = SIB_INDEX_NONE; @@ -963,7 +997,7 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibIndex = SIB_INDEX_NONE; break; } - + switch (scaleFromSIB(insn->sib)) { case 0: insn->sibScale = 1; @@ -978,9 +1012,9 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibScale = 8; break; } - + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); - + switch (base) { case 0x5: switch (modFromModRM(insn->modRM)) { @@ -990,12 +1024,12 @@ static int readSIB(struct InternalInstruction* insn) { break; case 0x1: insn->eaDisplacement = EA_DISP_8; - insn->sibBase = (insn->addressSize == 4 ? + insn->sibBase = (insn->addressSize == 4 ? SIB_BASE_EBP : SIB_BASE_RBP); break; case 0x2: insn->eaDisplacement = EA_DISP_32; - insn->sibBase = (insn->addressSize == 4 ? + insn->sibBase = (insn->addressSize == 4 ? SIB_BASE_EBP : SIB_BASE_RBP); break; case 0x3: @@ -1007,7 +1041,7 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibBase = (SIBBase)(sibBaseBase + base); break; } - + return 0; } @@ -1015,22 +1049,22 @@ static int readSIB(struct InternalInstruction* insn) { * readDisplacement - Consumes the displacement of an instruction. * * @param insn - The instruction whose displacement is to be read. - * @return - 0 if the displacement byte was successfully read; nonzero + * @return - 0 if the displacement byte was successfully read; nonzero * otherwise. 
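The SIB decode in readSIB() above reduces to three field extractions plus the REX widening; a minimal sketch (special cases such as index 4 meaning "no index" and base 5 selecting a displacement are handled by the switch statements in the real code):

    #include <cstdint>

    struct SIBParts { unsigned scale, index, base; };

    static SIBParts decodeSIB(uint8_t sib, uint8_t rexX, uint8_t rexB) {
      SIBParts p;
      p.scale = 1u << (sib >> 6);                 // 00/01/10/11 -> 1/2/4/8
      p.index = ((sib >> 3) & 0x7) | (rexX << 3); // REX.X widens the index
      p.base  = (sib & 0x7)        | (rexB << 3); // REX.B widens the base
      return p;
    }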
*/ -static int readDisplacement(struct InternalInstruction* insn) { +static int readDisplacement(struct InternalInstruction* insn) { int8_t d8; int16_t d16; int32_t d32; - + dbgprintf(insn, "readDisplacement()"); - + if (insn->consumedDisplacement) return 0; - + insn->consumedDisplacement = TRUE; insn->displacementOffset = insn->readerCursor - insn->startLocation; - + switch (insn->eaDisplacement) { case EA_DISP_NONE: insn->consumedDisplacement = FALSE; @@ -1051,7 +1085,7 @@ static int readDisplacement(struct InternalInstruction* insn) { insn->displacement = d32; break; } - + insn->consumedDisplacement = TRUE; return 0; } @@ -1063,22 +1097,22 @@ static int readDisplacement(struct InternalInstruction* insn) { * @param insn - The instruction whose addressing information is to be read. * @return - 0 if the information was successfully read; nonzero otherwise. */ -static int readModRM(struct InternalInstruction* insn) { +static int readModRM(struct InternalInstruction* insn) { uint8_t mod, rm, reg; - + dbgprintf(insn, "readModRM()"); - + if (insn->consumedModRM) return 0; - + if (consumeByte(insn, &insn->modRM)) return -1; insn->consumedModRM = TRUE; - + mod = modFromModRM(insn->modRM); rm = rmFromModRM(insn->modRM); reg = regFromModRM(insn->modRM); - + /* * This goes by insn->registerSize to pick the correct register, which messes * up if we're using (say) XMM or 8-bit register operands. That gets fixed in @@ -1098,16 +1132,16 @@ static int readModRM(struct InternalInstruction* insn) { insn->eaRegBase = EA_REG_RAX; break; } - + reg |= rFromREX(insn->rexPrefix) << 3; rm |= bFromREX(insn->rexPrefix) << 3; - + insn->reg = (Reg)(insn->regBase + reg); - + switch (insn->addressSize) { case 2: insn->eaBaseBase = EA_BASE_BX_SI; - + switch (mod) { case 0x0: if (rm == 0x6) { @@ -1142,14 +1176,14 @@ static int readModRM(struct InternalInstruction* insn) { case 4: case 8: insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); - + switch (mod) { case 0x0: insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ switch (rm) { case 0x4: case 0xc: /* in case REXW.b is set */ - insn->eaBase = (insn->addressSize == 4 ? + insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64); readSIB(insn); if (readDisplacement(insn)) @@ -1191,7 +1225,7 @@ static int readModRM(struct InternalInstruction* insn) { } break; } /* switch (insn->addressSize) */ - + return 0; } @@ -1221,6 +1255,8 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ case TYPE_XMM256: \ return prefix##_YMM0 + index; \ case TYPE_XMM128: \ @@ -1274,12 +1310,12 @@ GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) * @return - 0 if fixup was successful; -1 if the register returned was * invalid for its class. */ -static int fixupReg(struct InternalInstruction *insn, +static int fixupReg(struct InternalInstruction *insn, const struct OperandSpecifier *op) { uint8_t valid; - + dbgprintf(insn, "fixupReg()"); - + switch ((OperandEncoding)op->encoding) { default: debug("Expected a REG or R/M encoding in fixupReg"); @@ -1311,12 +1347,12 @@ static int fixupReg(struct InternalInstruction *insn, } break; } - + return 0; } /* - * readOpcodeModifier - Reads an operand from the opcode field of an + * readOpcodeModifier - Reads an operand from the opcode field of an * instruction. Handles AddRegFrm instructions. * * @param insn - The instruction whose opcode field is to be read. 
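Likewise, the ModR/M split in readModRM() above boils down to the following, with REX.R/REX.B widening reg and rm to four bits (mod 3 selects the register form; rm 4 with mod != 3 means a SIB byte follows):

    #include <cstdint>

    struct ModRMParts { uint8_t mod, reg, rm; };

    static ModRMParts decodeModRM(uint8_t modrm, uint8_t rexR, uint8_t rexB) {
      ModRMParts p;
      p.mod = modrm >> 6;                          // 0/1/2 memory, 3 register
      p.reg = ((modrm >> 3) & 0x7) | (rexR << 3);
      p.rm  = (modrm & 0x7)        | (rexB << 3);
      return p;
    }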
@@ -1326,12 +1362,12 @@ static int fixupReg(struct InternalInstruction *insn, */ static int readOpcodeModifier(struct InternalInstruction* insn) { dbgprintf(insn, "readOpcodeModifier()"); - + if (insn->consumedOpcodeModifier) return 0; - + insn->consumedOpcodeModifier = TRUE; - + switch (insn->spec->modifierType) { default: debug("Unknown modifier type."); @@ -1345,11 +1381,11 @@ static int readOpcodeModifier(struct InternalInstruction* insn) { case MODIFIER_MODRM: insn->opcodeModifier = insn->modRM - insn->spec->modifierBase; return 0; - } + } } /* - * readOpcodeRegister - Reads an operand from the opcode field of an + * readOpcodeRegister - Reads an operand from the opcode field of an * instruction and interprets it appropriately given the operand width. * Handles AddRegFrm instructions. * @@ -1364,39 +1400,39 @@ static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { if (readOpcodeModifier(insn)) return -1; - + if (size == 0) size = insn->registerSize; - + switch (size) { case 1: - insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) + insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) | insn->opcodeModifier)); - if (insn->rexPrefix && + if (insn->rexPrefix && insn->opcodeRegister >= MODRM_REG_AL + 0x4 && insn->opcodeRegister < MODRM_REG_AL + 0x8) { insn->opcodeRegister = (Reg)(MODRM_REG_SPL + (insn->opcodeRegister - MODRM_REG_AL - 4)); } - + break; case 2: insn->opcodeRegister = (Reg)(MODRM_REG_AX - + ((bFromREX(insn->rexPrefix) << 3) + + ((bFromREX(insn->rexPrefix) << 3) | insn->opcodeModifier)); break; case 4: insn->opcodeRegister = (Reg)(MODRM_REG_EAX - + ((bFromREX(insn->rexPrefix) << 3) + + ((bFromREX(insn->rexPrefix) << 3) | insn->opcodeModifier)); break; case 8: - insn->opcodeRegister = (Reg)(MODRM_REG_RAX - + ((bFromREX(insn->rexPrefix) << 3) + insn->opcodeRegister = (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) | insn->opcodeModifier)); break; } - + return 0; } @@ -1414,20 +1450,20 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { uint16_t imm16; uint32_t imm32; uint64_t imm64; - + dbgprintf(insn, "readImmediate()"); - + if (insn->numImmediatesConsumed == 2) { debug("Already consumed two immediates"); return -1; } - + if (size == 0) size = insn->immediateSize; else insn->immediateSize = size; insn->immediateOffset = insn->readerCursor - insn->startLocation; - + switch (size) { case 1: if (consumeByte(insn, &imm8)) @@ -1450,9 +1486,9 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { insn->immediates[insn->numImmediatesConsumed] = imm64; break; } - + insn->numImmediatesConsumed++; - + return 0; } @@ -1465,7 +1501,7 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { */ static int readVVVV(struct InternalInstruction* insn) { dbgprintf(insn, "readVVVV()"); - + if (insn->vexSize == 3) insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]); else if (insn->vexSize == 2) @@ -1490,14 +1526,14 @@ static int readOperands(struct InternalInstruction* insn) { int index; int hasVVVV, needVVVV; int sawRegImm = 0; - + dbgprintf(insn, "readOperands()"); /* If non-zero vvvv specified, need to make sure one of the operands uses it. 
*/ hasVVVV = !readVVVV(insn); needVVVV = hasVVVV && (insn->vvvv != 0); - + for (index = 0; index < X86_MAX_OPERANDS; ++index) { switch (x86OperandSets[insn->spec->operands][index].encoding) { case ENCODING_NONE: @@ -1599,7 +1635,7 @@ static int readOperands(struct InternalInstruction* insn) { /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ if (needVVVV) return -1; - + return 0; } @@ -1607,7 +1643,7 @@ static int readOperands(struct InternalInstruction* insn) { * decodeInstruction - Reads and interprets a full instruction provided by the * user. * - * @param insn - A pointer to the instruction to be populated. Must be + * @param insn - A pointer to the instruction to be populated. Must be * pre-allocated. * @param reader - The function to be used to read the instruction's bytes. * @param readerArg - A generic argument to be passed to the reader to store @@ -1632,7 +1668,7 @@ int decodeInstruction(struct InternalInstruction* insn, uint64_t startLoc, DisassemblerMode mode) { memset(insn, 0, sizeof(struct InternalInstruction)); - + insn->reader = reader; insn->readerArg = readerArg; insn->dlog = logger; @@ -1641,7 +1677,7 @@ int decodeInstruction(struct InternalInstruction* insn, insn->readerCursor = startLoc; insn->mode = mode; insn->numImmediatesConsumed = 0; - + if (readPrefixes(insn) || readOpcode(insn) || getID(insn, miiArg) || @@ -1650,14 +1686,14 @@ int decodeInstruction(struct InternalInstruction* insn, return -1; insn->operands = &x86OperandSets[insn->spec->operands][0]; - + insn->length = insn->readerCursor - insn->startLocation; - + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", startLoc, insn->readerCursor, insn->length); - + if (insn->length > 15) dbgprintf(insn, "Instruction exceeds 15-byte limit"); - + return 0; } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 407ead3..dcb6aad 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -219,7 +219,23 @@ extern "C" { ENTRY(XMM12) \ ENTRY(XMM13) \ ENTRY(XMM14) \ - ENTRY(XMM15) + ENTRY(XMM15) \ + ENTRY(XMM16) \ + ENTRY(XMM17) \ + ENTRY(XMM18) \ + ENTRY(XMM19) \ + ENTRY(XMM20) \ + ENTRY(XMM21) \ + ENTRY(XMM22) \ + ENTRY(XMM23) \ + ENTRY(XMM24) \ + ENTRY(XMM25) \ + ENTRY(XMM26) \ + ENTRY(XMM27) \ + ENTRY(XMM28) \ + ENTRY(XMM29) \ + ENTRY(XMM30) \ + ENTRY(XMM31) #define REGS_YMM \ ENTRY(YMM0) \ @@ -237,7 +253,57 @@ extern "C" { ENTRY(YMM12) \ ENTRY(YMM13) \ ENTRY(YMM14) \ - ENTRY(YMM15) + ENTRY(YMM15) \ + ENTRY(YMM16) \ + ENTRY(YMM17) \ + ENTRY(YMM18) \ + ENTRY(YMM19) \ + ENTRY(YMM20) \ + ENTRY(YMM21) \ + ENTRY(YMM22) \ + ENTRY(YMM23) \ + ENTRY(YMM24) \ + ENTRY(YMM25) \ + ENTRY(YMM26) \ + ENTRY(YMM27) \ + ENTRY(YMM28) \ + ENTRY(YMM29) \ + ENTRY(YMM30) \ + ENTRY(YMM31) + +#define REGS_ZMM \ + ENTRY(ZMM0) \ + ENTRY(ZMM1) \ + ENTRY(ZMM2) \ + ENTRY(ZMM3) \ + ENTRY(ZMM4) \ + ENTRY(ZMM5) \ + ENTRY(ZMM6) \ + ENTRY(ZMM7) \ + ENTRY(ZMM8) \ + ENTRY(ZMM9) \ + ENTRY(ZMM10) \ + ENTRY(ZMM11) \ + ENTRY(ZMM12) \ + ENTRY(ZMM13) \ + ENTRY(ZMM14) \ + ENTRY(ZMM15) \ + ENTRY(ZMM16) \ + ENTRY(ZMM17) \ + ENTRY(ZMM18) \ + ENTRY(ZMM19) \ + ENTRY(ZMM20) \ + ENTRY(ZMM21) \ + ENTRY(ZMM22) \ + ENTRY(ZMM23) \ + ENTRY(ZMM24) \ + ENTRY(ZMM25) \ + ENTRY(ZMM26) \ + ENTRY(ZMM27) \ + ENTRY(ZMM28) \ + ENTRY(ZMM29) \ + ENTRY(ZMM30) \ + ENTRY(ZMM31) #define REGS_SEGMENT \ ENTRY(ES) \ @@ -285,6 +351,7 @@ extern "C" { REGS_MMX \ REGS_XMM \ REGS_YMM \ + REGS_ZMM \ REGS_SEGMENT \ REGS_DEBUG 
\ REGS_CONTROL \ @@ -319,6 +386,7 @@ typedef enum { ALL_EA_BASES REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY SIB_INDEX_max } SIBIndex; @@ -457,6 +525,8 @@ struct InternalInstruction { uint64_t necessaryPrefixLocation; /* The segment override type */ SegmentOverride segmentOverride; + /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */ + BOOL xAcquireRelease; /* Sizes of various critical pieces of data, in bytes */ uint8_t registerSize; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 23dfe4b..d291441 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -116,8 +116,106 @@ enum attributeBits { ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ - ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") - + ENUM_ENTRY(IC_VEX_L_W, 3, "requires VEX, L and W") \ + ENUM_ENTRY(IC_VEX_L_W_XS, 4, "requires VEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_VEX_L_W_XD, 4, "requires VEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 4, "requires VEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \ + ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and 
the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \ + ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L 
prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, typedef enum { @@ -224,6 +322,7 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ ENUM_ENTRY(ENCODING_CW, "2-byte") \ ENUM_ENTRY(ENCODING_CD, "4-byte") \ @@ -321,6 +420,9 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM64, "8-byte") \ ENUM_ENTRY(TYPE_XMM128, "16-byte") \ ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK8, "8-bit") \ + ENUM_ENTRY(TYPE_VK16, "16-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index e357710..b9d0082 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Try to print any aliases first. if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); - + // Next always print the annotation. printAnnotation(OS, Annot); @@ -139,8 +139,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + O << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. 
@@ -159,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); - + if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); - + } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << markup("<imm:") @@ -177,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &IndexReg = MI->getOperand(Op+2); const MCOperand &DispSpec = MI->getOperand(Op+3); const MCOperand &SegReg = MI->getOperand(Op+4); - + O << markup("<mem:"); // If this has a segment register, print it. @@ -185,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, printOperand(MI, Op+4, O); O << ':'; } - + if (DispSpec.isImm()) { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) @@ -194,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); O << *DispSpec.getExpr(); } - + if (IndexReg.getReg() || BaseReg.getReg()) { O << '('; if (BaseReg.getReg()) printOperand(MI, Op, O); - + if (IndexReg.getReg()) { O << ','; printOperand(MI, Op+2, O); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); if (ScaleVal != 1) { O << ',' - << markup("<imm:") + << markup("<imm:") << ScaleVal // never printed in hex. - << markup(">"); + << markup(">"); } } O << ')'; diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 8e09183..8d05256 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -65,6 +65,9 @@ public: void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -80,6 +83,9 @@ public: void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } }; } diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 141f4a4..9dfc9a9 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -119,7 +119,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) - O << Op.getImm(); + O << formatImm(Op.getImm()); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print @@ -127,8 +127,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + O << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. 
@@ -137,18 +136,13 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, } } -static void PrintRegName(raw_ostream &O, StringRef RegName) { - for (unsigned i = 0, e = RegName.size(); i != e; ++i) - O << (char)toupper(RegName[i]); -} - void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - PrintRegName(O, getRegisterName(Op.getReg())); + printRegName(O, Op.getReg()); } else if (Op.isImm()) { - O << Op.getImm(); + O << formatImm((int64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << *Op.getExpr(); @@ -200,7 +194,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, DispVal = -DispVal; } } - O << DispVal; + O << formatImm(DispVal); } } diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index bb769eb..45beeda 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -41,52 +41,60 @@ public: void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "OPAQUE PTR "; + O << "opaque ptr "; printMemReference(MI, OpNo, O); } void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "BYTE PTR "; + O << "byte ptr "; printMemReference(MI, OpNo, O); } void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "WORD PTR "; + O << "word ptr "; printMemReference(MI, OpNo, O); } void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword ptr "; printMemReference(MI, OpNo, O); } void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O << "xmmword ptr "; printMemReference(MI, OpNo, O); } void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword ptr "; printMemReference(MI, OpNo, O); } void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XWORD PTR "; + O << "xword ptr "; printMemReference(MI, OpNo, O); } void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O << "xmmword ptr "; printMemReference(MI, OpNo, O); } void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } }; diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk index 39478b3..31ba842 100644 --- a/lib/Target/X86/MCTargetDesc/Android.mk +++ b/lib/Target/X86/MCTargetDesc/Android.mk @@ -8,9 +8,11 @@ x86_mc_desc_TBLGEN_TABLES := \ x86_mc_desc_SRC_FILES := \ X86AsmBackend.cpp \ X86ELFObjectWriter.cpp \ + 
X86ELFRelocationInfo.cpp \ X86MCTargetDesc.cpp \ X86MCAsmInfo.cpp \ X86MCCodeEmitter.cpp \ + X86MachORelocationInfo.cpp \ X86MachObjectWriter.cpp \ X86WinCOFFObjectWriter.cpp diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 1c240e5..2eb5f25 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -6,6 +6,8 @@ add_llvm_library(LLVMX86Desc X86MachObjectWriter.cpp X86ELFObjectWriter.cpp X86WinCOFFObjectWriter.cpp + X86MachORelocationInfo.cpp + X86ELFRelocationInfo.cpp ) add_dependencies(LLVMX86Desc X86CommonTableGen) diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 9e68388..25d1af3 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -20,6 +20,7 @@ #include "X86MCTargetDesc.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/MC/MCInstrInfo.h" namespace llvm { @@ -41,7 +42,6 @@ namespace X86 { AddrNumOperands = 5 }; } // end namespace X86; - /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. @@ -274,11 +274,12 @@ namespace X86II { //// MRM_XX - A mod/rm byte of exactly 0xXX. MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36, - MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40, - MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46, - MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50, - MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54, - MRM_DE = 55, MRM_DF = 56, + MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, MRM_CB = 40, + MRM_E8 = 41, MRM_F0 = 42, MRM_F8 = 45, MRM_F9 = 46, + MRM_D0 = 47, MRM_D1 = 48, MRM_D4 = 49, MRM_D5 = 50, + MRM_D6 = 51, MRM_D8 = 52, MRM_D9 = 53, MRM_DA = 54, + MRM_DB = 55, MRM_DC = 56, MRM_DD = 57, MRM_DE = 58, + MRM_DF = 59, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by @@ -461,20 +462,54 @@ namespace X86II { // prefix. Usually used for scalar instructions. Needed by disassembler. VEX_LIG = 1U << 6, + // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field + // with following encoding: + // - 00 V128 + // - 01 V256 + // - 10 V512 + // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. + // this will save 1 tsflag bit + + // VEX_EVEX - Specifies that this instruction use EVEX form which provides + // syntax support up to 32 512-bit register operands and up to 7 16-bit + // mask operands as well as source operand data swizzling/memory operand + // conversion, eviction hint, and rounding mode. + EVEX = 1U << 7, + + // EVEX_K - Set if this instruction requires masking + EVEX_K = 1U << 8, + + // EVEX_Z - Set if this instruction has EVEX.Z field set. + EVEX_Z = 1U << 9, + + // EVEX_L2 - Set if this instruction has EVEX.L' field set. + EVEX_L2 = 1U << 10, + + // EVEX_B - Set if this instruction has EVEX.B field set. + EVEX_B = 1U << 11, + + // EVEX_CD8E - compressed disp8 form, element-size + EVEX_CD8EShift = VEXShift + 12, + EVEX_CD8EMask = 3, + + // EVEX_CD8V - compressed disp8 form, vector-width + EVEX_CD8VShift = EVEX_CD8EShift + 2, + EVEX_CD8VMask = 7, + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction /// storing a classifier in the imm8 field. 
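The new EVEX bits declared above sit in the same TSFlags word, above VEXShift, and the emitter reads them back with shift-and-mask tests; a sketch of that pattern, using 32 as a stand-in for X86II::VEXShift (the real value is whatever X86BaseInfo.h defines):

    #include <cstdint>

    // Positions relative to the VEX shift, mirroring the declarations above.
    enum : uint64_t {
      kEVEX    = 1ull << 7,  kEVEX_K = 1ull << 8,  kEVEX_Z = 1ull << 9,
      kEVEX_L2 = 1ull << 10, kEVEX_B = 1ull << 11
    };

    static const unsigned kVEXShift = 32;   // stand-in for X86II::VEXShift

    static bool hasEVEX(uint64_t tsFlags) {
      return (tsFlags >> kVEXShift) & kEVEX;
    }
    static bool hasEVEX_K(uint64_t tsFlags) {
      return hasEVEX(tsFlags) && ((tsFlags >> kVEXShift) & kEVEX_K);
    }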
To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7, + Has3DNow0F0FOpcode = 1U << 17, /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. - MemOp4 = 1U << 8, + MemOp4 = 1U << 18, /// XOP - Opcode prefix used by XOP instructions. - XOP = 1U << 9 + XOP = 1U << 19 }; @@ -521,6 +556,33 @@ namespace X86II { } } + /// getOperandBias - compute any additional adjustment needed to + /// the offset to the start of the memory operand + /// in this instruction. + /// If this is a two-address instruction,skip one of the register operands. + /// FIXME: This should be handled during MCInst lowering. + inline int getOperandBias(const MCInstrDesc& Desc) + { + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; + return CurOp; + } + /// getMemoryOperandNo - The function returns the MCInst operand # for the /// first field of the memory operand. If the instruction doesn't have a /// memory operand, this returns -1. @@ -548,12 +610,15 @@ namespace X86II { case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). if (HasMemOp4) ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). - + if (HasEVEX_K) + ++FirstMemOp;// Skip the mask register // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -574,17 +639,15 @@ namespace X86II { ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). 
return FirstMemOp; } - case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: - case X86II::MRM_C8: case X86II::MRM_C9: - case X86II::MRM_E8: case X86II::MRM_F0: - case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: - case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: - case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: + case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: + case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_E8: + case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: + case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: + case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: + case X86II::MRM_DF: return -1; } } @@ -592,6 +655,14 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. inline bool isX86_64ExtendedReg(unsigned RegNo) { + if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) || + (RegNo > X86::XMM23 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM7 && RegNo <= X86::YMM15) || + (RegNo > X86::YMM23 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) || + (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31)) + return true; + switch (RegNo) { default: break; case X86::R8: case X86::R9: case X86::R10: case X86::R11: @@ -602,16 +673,21 @@ namespace X86II { case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: - case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: - case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: - case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: - case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: return true; } return false; } + + /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) + /// registers? e.g. zmm21, etc. 
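Reading the ranges in isX86_64ExtendedReg and is32ExtendedReg as 5-bit register indices (XMM0..XMM31 mapped to 0..31), the two checks correspond to bits 3 and 4 of the index; a small equivalent sketch over raw indices:

    #include <cassert>

    // Bit 3: the classic REX-style "extended" bit (indices 8-15 and 24-31).
    static bool isExtendedIdx(unsigned idx) {
      assert(idx < 32);
      return (idx >> 3) & 1;
    }

    // Bit 4: the extra EVEX bit for the upper bank (indices 16-31).
    static bool isUpperBankIdx(unsigned idx) {
      assert(idx < 32);
      return (idx >> 4) & 1;
    }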
+ static inline bool is32ExtendedReg(unsigned RegNo) { + return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM15 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); + } + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index de80dd8..b400b87 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -101,7 +101,18 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); - case FK_Data_8: Type = ELF::R_X86_64_64; break; + case FK_Data_8: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_64; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_X86_64_DTPOFF64; + break; + } + break; case X86::reloc_signed_4byte: switch (Modifier) { default: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp new file mode 100644 index 0000000..8f4ab46 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -0,0 +1,135 @@ +//===-- X86ELFRelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/Object/ELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace object; +using namespace ELF; + +namespace { +class X86_64ELFRelocationInfo : public MCRelocationInfo { +public: + X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) { + uint64_t RelType; Rel.getType(RelType); + symbol_iterator SymI = Rel.getSymbol(); + + StringRef SymName; SymI->getName(SymName); + uint64_t SymAddr; SymI->getAddress(SymAddr); + uint64_t SymSize; SymI->getSize(SymSize); + int64_t Addend; getELFRelocationAddend(Rel, Addend); + + MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (Sym->isVariable() == false) + Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); + + const MCExpr *Expr = 0; + // If hasAddend is true, then we need to add Addend (r_addend) to Expr. + bool hasAddend = false; + + // The AMD64 SysV ABI says: + // A: the addend used to compute the value of the relocatable field. + // B: the base address at which a shared object has been loaded into memory + // during execution. Generally, a shared object is built with a 0 base + // virtual address, but the execution address will be different. + // G: the offset into the global offset table at which the relocation + // entry's symbol will reside during execution. + // GOT: the address of the global offset table. + // L: the place (section offset or address) of the Procedure Linkage Table + // entry for a symbol. 
+ // P: the place (section offset or address) of the storage unit being + // relocated (computed using r_offset). + // S: the value of the symbol whose index resides in the relocation entry. + // Z: the size of the symbol whose index resides in the relocation entry. + + switch(RelType) { + case R_X86_64_NONE: + case R_X86_64_COPY: + // none + break; + case R_X86_64_64: + case R_X86_64_16: + case R_X86_64_8: + // S + A + case R_X86_64_32: + case R_X86_64_32S: + // S + A (We don't care about the result not fitting in 32 bits.) + case R_X86_64_PC32: + case R_X86_64_PC16: + case R_X86_64_PC8: + case R_X86_64_PC64: + // S + A - P (P/pcrel is implicit) + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + case R_X86_64_GOT32: + case R_X86_64_GOT64: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPC64: + case R_X86_64_GOTPLT64: + // G + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx); + break; + case R_X86_64_PLT32: + // L + A - P -> S@PLT + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx); + break; + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + // S + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + case R_X86_64_GOTPCREL: + case R_X86_64_GOTPCREL64: + // G + GOT + A - P -> S@GOTPCREL + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case R_X86_64_GOTOFF64: + // S + A - GOT + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx); + break; + case R_X86_64_PLTOFF64: + // L + A - GOT + break; + case R_X86_64_SIZE32: + case R_X86_64_SIZE64: + // Z + A + Expr = MCConstantExpr::Create(SymSize, Ctx); + break; + default: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + } + if (Expr && hasAddend && Addend != 0) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(Addend, Ctx), + Ctx); + return Expr; + } +}; +} // End unnamed namespace + +/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) { + // We only handle x86-64 for now. + return new X86_64ELFRelocationInfo(Ctx); +} diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 5fbefae..8515879 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -53,7 +53,7 @@ public: } unsigned GetX86RegNum(const MCOperand &MO) const { - return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7; + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; } // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range @@ -77,6 +77,14 @@ public: return (~SrcRegNum) & 0xf; } + unsigned char getWriteMaskRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + assert(X86::K0 != MI.getOperand(OpNum).getReg() && + "Invalid mask register as write-mask!"); + unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum)); + return MaskRegNum; + } + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { OS << (char)C; ++CurByte; @@ -152,6 +160,52 @@ static bool isDisp8(int Value) { return Value == (signed char)Value; } +/// isCDisp8 - Return true if this signed displacement fits in a 8-bit +/// compressed dispacement field. 
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { + assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) && + "Compressed 8-bit displacement is only valid for EVEX inst."); + + unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask; + unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask; + + if (CD8V == 0 && CD8E == 0) { + CValue = Value; + return isDisp8(Value); + } + + unsigned MemObjSize = 1U << CD8E; + if (CD8V & 4) { + // Fixed vector length + MemObjSize *= 1U << (CD8V & 0x3); + } else { + // Modified vector length + bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B; + if (!EVEX_b) { + unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0; + EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0; + assert(EVEX_LL < 3 && ""); + + unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize; + NumElems /= 1U << (CD8V & 0x3); + + MemObjSize *= NumElems; + } + } + + unsigned MemObjMask = MemObjSize - 1; + assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size."); + + if (Value & MemObjMask) // Unaligned offset + return false; + Value /= MemObjSize; + bool Ret = (Value == (signed char)Value); + + if (Ret) + CValue = Value; + return Ret; +} + /// getImmFixupKind - Return the appropriate fixup kind to use for an immediate /// in an instruction with the specified TSFlags. static MCFixupKind getImmFixupKind(uint64_t TSFlags) { @@ -237,6 +291,14 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) { return GOT_Normal; } +static bool HasSecRelSymbolRef(const MCExpr *Expr) { + if (Expr->getKind() == MCExpr::SymbolRef) { + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + return Ref->getKind() == MCSymbolRefExpr::VK_SECREL; + } + return false; +} + void X86MCCodeEmitter:: EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, @@ -268,8 +330,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, if (Kind == GOT_Normal) ImmOffset = CurByte; } else if (Expr->getKind() == MCExpr::SymbolRef) { - const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); - if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL) { + if (HasSecRelSymbolRef(Expr)) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } else if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr); + if (HasSecRelSymbolRef(Bin->getLHS()) + || HasSecRelSymbolRef(Bin->getRHS())) { FixupKind = MCFixupKind(FK_SecRel_4); } } @@ -305,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; // Handle %rip relative addressing. if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode @@ -365,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. 
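The compressed displacement logic in isCDisp8 above implements the EVEX disp8*N rule: the stored byte is the displacement divided by the memory-object size N derived from CD8E/CD8V, so the displacement must be N-aligned and the quotient must fit in a signed byte. A minimal sketch of just that final check:

    #include <cstdint>

    // memObjSize is the N computed from CD8E/CD8V (and L/L' when not fixed).
    static bool compressDisp8(int32_t disp, int32_t memObjSize, int8_t &out) {
      if (disp % memObjSize != 0)          // unaligned: fall back to disp32
        return false;
      int32_t scaled = disp / memObjSize;
      if (scaled != (int8_t)scaled)        // still too large after scaling
        return false;
      out = (int8_t)scaled;
      return true;
    }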
- if (Disp.isImm() && isDisp8(Disp.getImm())) { - EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); - return; + if (Disp.isImm()) { + if (!HasEVEX && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // Try EVEX compressed 8-bit displacement first; if failed, fall back to + // 32-bit displacement. + int CDisp8 = 0; + if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + CDisp8 - Disp.getImm()); + return; + } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] @@ -384,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, bool ForceDisp32 = false; bool ForceDisp8 = false; + int CDisp8 = 0; + int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. @@ -399,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); - } else if (isDisp8(Disp.getImm())) { + } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); @@ -432,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Do we need to output a displacement? if (ForceDisp8) - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, Fixups); @@ -444,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; @@ -455,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0: Same as REX_R=1 (64 bit mode only) // unsigned char VEX_R = 0x1; + unsigned char EVEX_R2 = 0x1; // VEX_X: equivalent to REX.X, only used when a // register is used for index in SIB Byte. @@ -491,6 +580,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // VEX_4V (VEX vvvv field): a register specifier // (in 1's complement form) or 1111 if unused. 
unsigned char VEX_4V = 0xf; + unsigned char EVEX_V2 = 0x1; // VEX_L (Vector Length): // @@ -498,6 +588,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 1: 256-bit vector // unsigned char VEX_L = 0; + unsigned char EVEX_L2 = 0; // VEX_PP: opcode extension providing equivalent // functionality of a SIMD prefix @@ -509,6 +600,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char VEX_PP = 0; + // EVEX_U + unsigned char EVEX_U = 1; // Always '1' so far + + // EVEX_z + unsigned char EVEX_z = 0; + + // EVEX_b + unsigned char EVEX_b = 0; + + // EVEX_aaa + unsigned char EVEX_aaa = 0; + // Encode the operand size opcode prefix as needed. if (TSFlags & X86II::OpSize) VEX_PP = 0x01; @@ -521,6 +624,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2)) + EVEX_L2 = 1; + + if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z)) + EVEX_z = 1; + + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B)) + EVEX_b = 1; switch (TSFlags & X86II::Op0Mask) { default: llvm_unreachable("Invalid prefix!"); @@ -567,12 +678,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb CurOp += 2; - } + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); @@ -582,18 +700,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MemAddr, src1(VEX_4V), src2(ModR/M) // MemAddr, src1(ModR/M), imm8 // - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrBaseReg).getReg())) VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + CurOp += X86::AddrNumOperands; - CurOp = X86::AddrNumOperands; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } const MCOperand &MO = MI.getOperand(CurOp); - if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) - VEX_R = 0x0; + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + if (HasEVEX 
&& X86II::is32ExtendedReg(MO.getReg())) + EVEX_R2 = 0x0; + } break; } case X86II::MRMSrcMem: @@ -606,11 +741,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // FMA4: // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); - if (HasVEX_4V) + if (HasVEX_4V) { VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -618,6 +763,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; if (HasVEX_4VOp3) // Instruction format for 4VOp3: @@ -634,8 +782,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MRM[0-9]m instructions forms: // MemAddr // src1(VEX_4V), MemAddr - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 0); + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + } + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -656,16 +811,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (HasMemOp4) // Skip second register source (encoded in I8IMM) CurOp++; if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; if (HasVEX_4VOp3) VEX_4V = getVEXRegisterEncoding(MI, CurOp); @@ -677,13 +843,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M), src1(VEX_4V), src2(ModR/M) if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && 
X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; break; case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -691,9 +868,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::MRM6r: case X86II::MRM7r: // MRM0r-MRM7r instructions forms: // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, 0); - if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; break; default: // RawFrm break; @@ -702,29 +888,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Emit segment override opcode prefix as needed. EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); - // VEX opcode prefix can have 2 or 3 bytes - // - // 3 bytes: - // +-----+ +--------------+ +-------------------+ - // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - // 2 bytes: - // +-----+ +-------------------+ - // | C5h | | R | vvvv | L | pp | - // +-----+ +-------------------+ - // - unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + if (!HasEVEX) { + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix - EmitByte(0xC5, CurByte, OS); - EmitByte(LastByte | (VEX_R << 7), CurByte, OS); - return; - } + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } - // 3 byte VEX prefix - EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS); - EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + // 3 byte VEX prefix + EmitByte(XOP ? 
0x8F : 0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + } else { + // EVEX opcode prefix can have 4 bytes + // + // +-----+ +--------------+ +-------------------+ +------------------------+ + // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // +-----+ +--------------+ +-------------------+ +------------------------+ + assert((VEX_5M & 0x3) == VEX_5M + && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + VEX_5M &= 0x3; + + EmitByte(0x62, CurByte, OS); + EmitByte((VEX_R << 7) | + (VEX_X << 6) | + (VEX_B << 5) | + (EVEX_R2 << 4) | + VEX_5M, CurByte, OS); + EmitByte((VEX_W << 7) | + (VEX_4V << 3) | + (EVEX_U << 2) | + VEX_PP, CurByte, OS); + EmitByte((EVEX_z << 7) | + (EVEX_L2 << 6) | + (VEX_L << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + } } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -979,18 +1194,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if ((TSFlags & X86II::FormMask) == X86II::Pseudo) return; - // If this is a two-address instruction, skip one of the register operands. - // FIXME: This should be handled during MCInst lowering. unsigned NumOps = Desc.getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); - // Special case for GATHER with 2 TIED_TO operands - // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - } + unsigned CurOp = X86II::getOperandBias(Desc); // Keep track of the current byte being emitted. unsigned CurByte = 0; @@ -1004,6 +1209,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; const unsigned MemOp4_I8IMMOperand = 2; + // Does the instruction use the EVEX.aaa (writemask) field? + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); + // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; @@ -1054,6 +1263,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1066,6 +1278,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1079,6 +1294,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1097,6 +1315,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMSrcMem: { int AddrOperands = X86::AddrNumOperands; unsigned FirstMemOp = CurOp+1; + + if (HasEVEX_K) { // Skip writemask + ++AddrOperands; + ++FirstMemOp; + } + if (HasVEX_4V) { ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
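For reference, the four EVEX prefix bytes emitted in the hunk above pack as follows. This is a minimal standalone C++ sketch; packEVEXPrefix is a made-up name rather than an LLVM API, and the caller is assumed to pass the already-inverted one-bit fields plus the masked mm/pp/vvvv/aaa values exactly as EmitVEXOpcodePrefix computes them.

    #include <array>

    // Minimal sketch, not LLVM code: pack the 0x62 escape byte and the three
    // EVEX payload bytes exactly as the EmitByte calls above do.
    static std::array<unsigned char, 4>
    packEVEXPrefix(unsigned R, unsigned X, unsigned B, unsigned R2, unsigned mm,
                   unsigned W, unsigned vvvv, unsigned U, unsigned pp,
                   unsigned z, unsigned L2, unsigned L, unsigned b, unsigned V2,
                   unsigned aaa) {
      std::array<unsigned char, 4> P;
      P[0] = 0x62;                                                    // escape
      P[1] = (R << 7) | (X << 6) | (B << 5) | (R2 << 4) | (mm & 0x3); // RXBR'00mm
      P[2] = (W << 7) | (vvvv << 3) | (U << 2) | (pp & 0x3);          // Wvvvv U pp
      P[3] = (z << 7) | (L2 << 6) | (L << 5) | (b << 4) | (V2 << 3) | (aaa & 0x7);
      return P;
    }

Writing those four bytes in order is exactly what the EmitByte sequence in the patch does.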
@@ -1136,17 +1360,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, TSFlags, CurByte, OS, Fixups); CurOp += X86::AddrNumOperands; break; - case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: - case X86II::MRM_C8: case X86II::MRM_C9: - case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: - case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: - case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E8: case X86II::MRM_F0: - case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: + case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: + case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_D0: + case X86II::MRM_D1: case X86II::MRM_D4: case X86II::MRM_D5: + case X86II::MRM_D6: case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8: + case X86II::MRM_F9: EmitByte(BaseOpcode, CurByte, OS); unsigned char MRM; @@ -1158,10 +1380,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C4: MRM = 0xC4; break; case X86II::MRM_C8: MRM = 0xC8; break; case X86II::MRM_C9: MRM = 0xC9; break; + case X86II::MRM_CA: MRM = 0xCA; break; + case X86II::MRM_CB: MRM = 0xCB; break; case X86II::MRM_D0: MRM = 0xD0; break; case X86II::MRM_D1: MRM = 0xD1; break; case X86II::MRM_D4: MRM = 0xD4; break; case X86II::MRM_D5: MRM = 0xD5; break; + case X86II::MRM_D6: MRM = 0xD6; break; case X86II::MRM_D8: MRM = 0xD8; break; case X86II::MRM_D9: MRM = 0xD9; break; case X86II::MRM_DA: MRM = 0xDA; break; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5e84530..bd23ce4 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -263,7 +263,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { return X; } -static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { +static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { Triple TheTriple(TT); bool is64Bit = TheTriple.getArch() == Triple::x86_64; @@ -290,14 +290,16 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { int stackGrowth = is64Bit ? -8 : -4; // Initial state of the frame pointer is esp+stackGrowth. - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth); - MAI->addInitialFrameState(0, Dst, Src); + unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + MAI->addInitialFrameState(Inst); // Add return address to move list - MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth); - MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP); - MAI->addInitialFrameState(0, CSDst, CSSrc); + unsigned InstPtr = is64Bit ? 
X86::RIP : X86::EIP; + MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( + 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + MAI->addInitialFrameState(Inst2); return MAI; } @@ -382,6 +384,17 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, return 0; } +static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT, + MCContext &Ctx) { + Triple TheTriple(TT); + if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64) + return createX86_64MachORelocationInfo(Ctx); + else if (TheTriple.isOSBinFormatELF()) + return createX86_64ELFRelocationInfo(Ctx); + // Default to the stock relocation info. + return llvm::createMCRelocationInfo(TT, Ctx); +} + static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } @@ -439,4 +452,10 @@ extern "C" void LLVMInitializeX86TargetMC() { createX86MCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target, + createX86MCRelocationInfo); + TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target, + createX86MCRelocationInfo); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 981aa1a..2f459b4 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -25,6 +25,7 @@ class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; +class MCRelocationInfo; class Target; class StringRef; class raw_ostream; @@ -94,6 +95,12 @@ MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, uint16_t EMachine); /// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); + +/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info. +MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); + +/// createX86_64ELFRelocationInfo - Construct X86-64 ELF relocation info. +MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp new file mode 100644 index 0000000..75b5acf --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -0,0 +1,116 @@ +//===-- X86MachORelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/Object/MachO.h" + +using namespace llvm; +using namespace object; +using namespace macho; + +namespace { +class X86_64MachORelocationInfo : public MCRelocationInfo { +public: + X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) { + const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile()); + + uint64_t RelType; Rel.getType(RelType); + symbol_iterator SymI = Rel.getSymbol(); + + StringRef SymName; SymI->getName(SymName); + uint64_t SymAddr; SymI->getAddress(SymAddr); + + RelocationEntry RE = Obj->getRelocation(Rel.getRawDataRefImpl()); + bool isPCRel = Obj->getAnyRelocationPCRel(RE); + + MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (Sym->isVariable() == false) + Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); + const MCExpr *Expr = 0; + + switch(RelType) { + case RIT_X86_64_TLV: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + break; + case RIT_X86_64_Signed4: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(4, Ctx), + Ctx); + break; + case RIT_X86_64_Signed2: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(2, Ctx), + Ctx); + break; + case RIT_X86_64_Signed1: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(1, Ctx), + Ctx); + break; + case RIT_X86_64_GOTLoad: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case RIT_X86_64_GOT: + Expr = MCSymbolRefExpr::Create(Sym, isPCRel ? + MCSymbolRefExpr::VK_GOTPCREL : + MCSymbolRefExpr::VK_GOT, + Ctx); + break; + case RIT_X86_64_Subtractor: + { + RelocationRef RelNext; + Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext); + RelocationEntry RENext = Obj->getRelocation(RelNext.getRawDataRefImpl()); + + // X86_64_SUBTRACTOR must be followed by a relocation of type + // X86_64_RELOC_UNSIGNED . + // NOTE: Scattered relocations don't exist on x86_64. + unsigned RType = Obj->getAnyRelocationType(RENext); + if (RType != RIT_X86_64_Unsigned) + report_fatal_error("Expected X86_64_RELOC_UNSIGNED after " + "X86_64_RELOC_SUBTRACTOR."); + + const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx); + + symbol_iterator RSymI = RelNext.getSymbol(); + uint64_t RSymAddr; + RSymI->getAddress(RSymAddr); + StringRef RSymName; + RSymI->getName(RSymName); + + MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName); + if (RSym->isVariable() == false) + RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx)); + + const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx); + + Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx); + break; + } + default: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + } + return Expr; + } +}; +} // End unnamed namespace + +/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo. 
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) { + return new X86_64MachORelocationInfo(Ctx); +} diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index bc272ef..ed64a32 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -9,6 +9,8 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/COFF.h" #include "llvm/Support/ErrorHandling.h" @@ -27,7 +29,9 @@ namespace { X86WinCOFFObjectWriter(bool Is64Bit_); ~X86WinCOFFObjectWriter(); - virtual unsigned getRelocType(unsigned FixupKind) const; + virtual unsigned getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection) const LLVM_OVERRIDE; }; } @@ -38,7 +42,14 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_) X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} -unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const { +unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection) const { + unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + switch (FixupKind) { case FK_PCRel_4: case X86::reloc_riprel_4byte: @@ -46,6 +57,9 @@ unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const { return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32; case FK_Data_4: case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB : + COFF::IMAGE_REL_I386_DIR32NB; return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32; case FK_Data_8: if (Is64Bit) diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 496b704..adfa7fa 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -517,37 +517,6 @@ to <2 x i64> ops being so bad. //===---------------------------------------------------------------------===// -'select' on vectors and scalars could be a whole lot better. We currently -lower them to conditional branches. On x86-64 for example, we compile this: - -double test(double a, double b, double c, double d) { return a<b ? c : d; } - -to: - -_test: - ucomisd %xmm0, %xmm1 - ja LBB1_2 # entry -LBB1_1: # entry - movapd %xmm3, %xmm2 -LBB1_2: # entry - movapd %xmm2, %xmm0 - ret - -instead of: - -_test: - cmpltsd %xmm1, %xmm0 - andpd %xmm0, %xmm2 - andnpd %xmm3, %xmm0 - orpd %xmm2, %xmm0 - ret - -For unpredictable branches, the later is much more efficient. This should -just be a matter of having scalar sse map to SELECT_CC and custom expanding -or iseling it. - -//===---------------------------------------------------------------------===// - LLVM currently generates stack realignment code, when it is not necessary needed. The problem is that we need to know about stack alignment too early, before RA runs. 
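The Win COFF writer change above selects the 32-bit data relocation from the symbol modifier. A minimal sketch of that decision follows; the enum and function names are made up, and the enumerators only mirror the relevant COFF::IMAGE_REL_* constants so the snippet stays self-contained.

    // Minimal sketch, not the LLVM implementation: the choice the patch adds
    // for FK_Data_4 / reloc_signed_4byte fixups.
    enum class CoffRelocKind {
      AMD64_ADDR32, AMD64_ADDR32NB, // x86-64: absolute vs. image-relative
      I386_DIR32, I386_DIR32NB      // x86-32: absolute vs. image-relative
    };

    static CoffRelocKind pickData4Reloc(bool Is64Bit, bool IsImgRel32) {
      // IsImgRel32 stands for "the target symbol carries VK_COFF_IMGREL32".
      if (IsImgRel32)
        return Is64Bit ? CoffRelocKind::AMD64_ADDR32NB : CoffRelocKind::I386_DIR32NB;
      return Is64Bit ? CoffRelocKind::AMD64_ADDR32 : CoffRelocKind::I386_DIR32;
    }

All other fixup kinds keep their previous mapping; only the @imgrel-modified 32-bit data case gains the image-relative (ADDR32NB/DIR32NB) forms.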
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1f9919f..947002f 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -69,6 +69,11 @@ ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); +/// createX86FixupLEAs - Return a pass that selectively replaces +/// certain instructions (like add, sub, inc, dec, some shifts, +/// and some multiplies) by equivalent LEA instructions, in order +/// to eliminate execution delays in some Atom processors. +FunctionPass *createX86FixupLEAs(); } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 0216252..461ea9b 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -86,6 +86,19 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512", + "Enable AVX-512 instructions", + [FeatureAVX2]>; +def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; + def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -120,8 +133,14 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", "Support RTM instructions">; +def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true", + "Support HLE">; def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", "Support ADX instructions">; +def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", + "Support PRFCHW instructions">; +def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", + "Support RDSEED instruction">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", @@ -130,6 +149,11 @@ def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", "SlowDivide", "true", "Use small divide for positive values less than 256">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; +def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", + "CallRegIndirect", "true", + "Call register indirect">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; //===----------------------------------------------------------------------===// // X86 processors supported.
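The createX86FixupLEAs declaration above describes replacing simple arithmetic with equivalent LEA instructions. As a rough illustration only, with made-up types (the real pass works on MachineInstrs), an add of an immediate maps directly onto the base + index*scale + disp address form that LEA computes:

    // Rough illustration, not the pass implementation: the address form an LEA
    // encodes, and how "dst = src + imm" fits it, i.e. lea dst, [src + imm].
    struct LEAAddress {
      unsigned BaseReg;  // base register
      unsigned IndexReg; // index register, 0 if unused
      unsigned Scale;    // 1, 2, 4 or 8
      int Disp;          // signed displacement
    };

    static LEAAddress addRegImmAsLEA(unsigned SrcReg, int Imm) {
      LEAAddress A = { SrcReg, /*IndexReg=*/0, /*Scale=*/1, /*Disp=*/Imm };
      return A;
    }

The X86.h comment ties this rewrite to avoiding execution delays on some Atom processors, and FeatureLEAUsesAG in the X86.td hunk records that LEA takes its inputs at the AG stage on those parts.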
@@ -143,9 +167,6 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -class AtomProc<string Name, list<SubtargetFeature> Features> - : ProcessorModel<Name, AtomModel, Features>; - def : Proc<"generic", []>; def : Proc<"i386", []>; def : Proc<"i486", []>; @@ -162,46 +183,71 @@ def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, FeatureFastUAMem]>; -def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, - FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, - FeatureSlowDivide, FeaturePadShortFunctions]>; +// Intel Core Duo. +def : ProcessorModel<"yonah", SandyBridgeModel, + [FeatureSSE3, FeatureSlowBTMem]>; + +// NetBurst. +def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + +// Intel Core 2 Solo/Duo. +def : ProcessorModel<"core2", SandyBridgeModel, + [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"penryn", SandyBridgeModel, + [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + +// Atom. +def : ProcessorModel<"atom", AtomModel, + [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, + FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, + FeatureSlowDivide, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions]>; + // "Arrandale" along with corei3 and corei5 -def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES]>; -def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem, - FeaturePOPCNT]>; +def : ProcessorModel<"corei7", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>; + +def : ProcessorModel<"nehalem", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT]>; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; +def : ProcessorModel<"westmere", SandyBridgeModel, + [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureFastUAMem, FeaturePOPCNT, FeatureAES, + FeaturePCLMUL]>; // Sandy Bridge // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. 
-def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; +def : ProcessorModel<"corei7-avx", SandyBridgeModel, + [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; // Ivy Bridge -def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, - FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>; +def : ProcessorModel<"core-avx-i", SandyBridgeModel, + [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase]>; // Haswell -def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, - FeatureRDRAND, FeatureF16C, FeatureFSGSBase, - FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, - FeatureRTM]>; +def : ProcessorModel<"core-avx2", HaswellModel, + [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, + FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, + FeatureHLE]>; + +// KNL +// FIXME: define KNL model +def : ProcessorModel<"knl", HaswellModel, + [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, + FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -231,11 +277,16 @@ def : Proc<"amdfam10", [FeatureSSE4A, // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT]>; +// Jaguar +def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, FeatureBMI, + FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeaturePOPCNT]>; // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePCLMUL, FeatureLZCNT, FeaturePOPCNT]>; -// Enhanced Bulldozer +// Piledriver def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePCLMUL, FeatureF16C, FeatureLZCNT, @@ -279,6 +330,9 @@ def ATTAsmParser : AsmParser { def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; + // Variant name. + string Name = "att"; + // Discard comments in assembly strings. string CommentDelimiter = "#"; @@ -289,6 +343,9 @@ def ATTAsmParserVariant : AsmParserVariant { def IntelAsmParserVariant : AsmParserVariant { int Variant = 1; + // Variant name. + string Name = "intel"; + // Discard comments in assembly strings. string CommentDelimiter = ";"; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index ac5daec..9e0ab82 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -201,7 +201,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, case X86II::MO_TLVP_PIC_BASE: O << "@TLVP" << '-' << *MF->getPICBaseSymbol(); break; - case X86II::MO_SECREL: O << "@SECREL"; break; + case X86II::MO_SECREL: O << "@SECREL32"; break; } } @@ -702,48 +702,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } -MachineLocation -X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. 
- - if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - -void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &O) { - // Only the target-dependent form of DBG_VALUE should get here. - // Referencing the offset and metadata as NOps-2 and NOps-1 is - // probably portable to other targets; frame pointer location is not. - unsigned NOps = MI->getNumOperands(); - assert(NOps==7); - O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - if (V.getContext().isSubprogram()) - O << DISubprogram(V.getContext()).getDisplayName() << ":"; - O << V.getName(); - O << " <- "; - // Frame address. Currently handles register +- offset only. - O << '['; - if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) - printOperand(MI, 0, O); - else - O << "undef"; - O << '+'; printOperand(MI, 3, O); - O << ']'; - O << "+"; - printOperand(MI, NOps-2, O); -} - - - //===----------------------------------------------------------------------===// // Target Registry Stuff //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index bc7496b..6eed5ce 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -67,11 +67,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { unsigned AsmVariant = 1); virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - virtual MachineLocation - getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE; }; } // end namespace llvm diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index b516be0..38e2591 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. 
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, @@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // i32, i64 in the standard way CCDelegateTo<RetCC_X86Common> ]>; @@ -156,6 +166,11 @@ def RetCC_X86_32 : CallingConv<[ def RetCC_X86_64 : CallingConv<[ // HiPE uses RetCC_X86_64_HiPE CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + + // Handle explicit CC selection + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -208,10 +223,15 @@ def CC_X86_64_C : CallingConv<[ // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>>, + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, @@ -225,7 +245,11 @@ def CC_X86_64_C : CallingConv<[ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToStack<32, 32>> + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> ]>; // Calling convention used on Win64 @@ -246,6 +270,9 @@ def CC_X86_Win64_C : CallingConv<[ // 256 bit vectors are passed by pointer CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType<i64>>, @@ -340,7 +367,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 4 AVX 256-bit vector arguments are passed in YMM registers. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. @@ -387,8 +414,8 @@ def CC_X86_32_ThisCall : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType<i32>>, - // Pass sret arguments indirectly through EAX - CCIfSRet<CCAssignToReg<[EAX]>>, + // Pass sret arguments indirectly through stack. + CCIfSRet<CCAssignToStack<4, 4>>, // The first integer argument is passed in ECX CCIfType<[i32], CCAssignToReg<[ECX]>>, @@ -464,6 +491,10 @@ def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + // The 512-bit vector arguments are passed in ZMM registers. 
+ CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, CCDelegateTo<CC_X86_32_C> @@ -489,6 +520,8 @@ def CC_X86_32 : CallingConv<[ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -528,6 +561,10 @@ def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "YMM%u", 6, 15))>; +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; //Standard C + XMM 8-15 def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, (sequence "XMM%u", 8, 15))>; @@ -535,3 +572,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, //Standard C + YMM 8-15 def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 2518e02..5d72b44 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -53,13 +53,8 @@ namespace { static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) : MachineFunctionPass(ID), II(0), TD(0), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(false), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - Emitter(X86TargetMachine &tm, CodeEmitter &mce, - const X86InstrInfo &ii, const DataLayout &td, bool is64) - : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(is64), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + MCE(mce), PICBaseOffset(0), Is64BitMode(false), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} bool runOnMachineFunction(MachineFunction &MF); @@ -1270,7 +1265,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64ri64i32) + if (Opcode == X86::MOV32ri64) rt = X86::reloc_absolute_word; // FIXME: add X86II flag? // This should not occur on Darwin for relocatable objects. if (Opcode == X86::MOV64ri) @@ -1451,6 +1446,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, MCE.emitByte(BaseOpcode); MCE.emitByte(0xC9); break; + case X86II::MRM_CA: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xCA); + break; + case X86II::MRM_CB: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xCB); + break; case X86II::MRM_E8: MCE.emitByte(BaseOpcode); MCE.emitByte(0xE8); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 85155f5..5bc3420 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -45,10 +45,6 @@ class X86FastISel : public FastISel { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// RegInfo - X86 register info. - /// - const X86RegisterInfo *RegInfo; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. 
/// When SSE is available, use it for f32 operations. @@ -63,17 +59,16 @@ public: Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); } virtual bool TargetSelectInstruction(const Instruction *I); - /// TryToFoldLoad - The specified machine instr operand is a vreg, and that + /// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if /// possible. - virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, - const LoadInst *LI); + virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI); virtual bool FastLowerArguments(); @@ -84,8 +79,10 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, + bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, + bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -107,6 +104,8 @@ private: bool X86SelectShift(const Instruction *I); + bool X86SelectDivRem(const Instruction *I); + bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); @@ -236,7 +235,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, + const X86AddressMode &AM, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -246,8 +246,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); - Val = AndResult; + TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; @@ -263,26 +263,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; break; case MVT::v4f32: - Opc = X86::MOVAPSmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - Opc = X86::MOVAPDmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - Opc = X86::MOVDQAmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; + else + Opc = Subtarget->hasAVX() ? 
X86::VMOVDQUmr : X86::MOVDQUmr; break; } addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DL, TII.get(Opc)), AM).addReg(Val); + DL, TII.get(Opc)), AM).addReg(ValReg); return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM) { + const X86AddressMode &AM, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); @@ -317,7 +326,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM); + return X86FastEmitStore(VT, ValReg, AM, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -693,8 +702,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { unsigned SABIAlignment = TD.getABITypeAlignment(S->getValueOperand()->getType()); - if (S->getAlignment() != 0 && S->getAlignment() < SABIAlignment) - return false; + bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; MVT VT; if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) @@ -704,7 +712,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (!X86SelectAddress(I->getOperand(1), AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM); + return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -720,10 +728,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall) + CC != CallingConv::X86_FastCall && + CC != CallingConv::X86_64_SysV) return false; - if (Subtarget->isTargetWin64()) + if (Subtarget->isCallingConvWin64(CC)) return false; // Don't handle popping bytes on return for now. @@ -816,14 +825,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // The x86-64 ABI for returning structs by value requires that we copy // the sret argument into %rax for the return. We saved the argument into // a virtual register in the entry block, so now we copy the value out - // and into %rax. - if (Subtarget->is64Bit() && F.hasStructRetAttr()) { + // and into %rax. We also do the same with %eax for Win32. + if (F.hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); + unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - X86::RAX).addReg(Reg); - RetRegs.push_back(X86::RAX); + RetReg).addReg(Reg); + RetRegs.push_back(RetReg); } // Now emit the RET. @@ -1006,10 +1017,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } bool X86FastISel::X86SelectZExt(const Instruction *I) { - // Handle zero-extension from i1 to i8, which is common. - if (!I->getOperand(0)->getType()->isIntegerTy(1)) - return false; - EVT DstVT = TLI.getValueType(I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; @@ -1018,12 +1025,37 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { if (ResultReg == 0) return false; - // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); - if (ResultReg == 0) - return false; + // Handle zero-extension from i1 to i8, which is common. 
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()).getSimpleVT(); + if (SrcVT.SimpleTy == MVT::i1) { + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + + if (ResultReg == 0) + return false; + } + + if (DstVT == MVT::i64) { + // Handle extension to 64-bits via sub-register shenanigans. + unsigned MovInst; + + switch (SrcVT.SimpleTy) { + case MVT::i8: MovInst = X86::MOVZX32rr8; break; + case MVT::i16: MovInst = X86::MOVZX32rr16; break; + case MVT::i32: MovInst = X86::MOV32rr; break; + default: llvm_unreachable("Unexpected zext to i64 source type"); + } + + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32) + .addReg(ResultReg); - if (DstVT != MVT::i8) { + ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG), + ResultReg) + .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT != MVT::i8) { ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) @@ -1233,6 +1265,170 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { return true; } +bool X86FastISel::X86SelectDivRem(const Instruction *I) { + const static unsigned NumTypes = 4; // i8, i16, i32, i64 + const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem + const static bool S = true; // IsSigned + const static bool U = false; // !IsSigned + const static unsigned Copy = TargetOpcode::COPY; + // For the X86 DIV/IDIV instruction, in most cases the dividend + // (numerator) must be in a specific register pair highreg:lowreg, + // producing the quotient in lowreg and the remainder in highreg. + // For most data types, to set up the instruction, the dividend is + // copied into lowreg, and lowreg is sign-extended or zero-extended + // into highreg. The exception is i8, where the dividend is defined + // as a single register rather than a register pair, and we + // therefore directly sign-extend or zero-extend the dividend into + // lowreg, instead of copying, and ignore the highreg. + const static struct DivRemEntry { + // The following portion depends only on the data type. + const TargetRegisterClass *RC; + unsigned LowInReg; // low part of the register pair + unsigned HighInReg; // high part of the register pair + // The following portion depends on both the data type and the operation. + struct DivRemResult { + unsigned OpDivRem; // The specific DIV/IDIV opcode to use. + unsigned OpSignExtend; // Opcode for sign-extending lowreg into + // highreg, or copying a zero into highreg. + unsigned OpCopy; // Opcode for copying dividend into lowreg, or + // zero/sign-extending into lowreg for i8. + unsigned DivRemResultReg; // Register containing the desired result. + bool IsOpSigned; // Whether to use signed or unsigned form. 
+ } ResultTable[NumOps]; + } OpTable[NumTypes] = { + { &X86::GR8RegClass, X86::AX, 0, { + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem + } + }, // i8 + { &X86::GR16RegClass, X86::AX, X86::DX, { + { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv + { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem + { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem + } + }, // i16 + { &X86::GR32RegClass, X86::EAX, X86::EDX, { + { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv + { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem + { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv + { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem + } + }, // i32 + { &X86::GR64RegClass, X86::RAX, X86::RDX, { + { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv + { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem + } + }, // i64 + }; + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned TypeIndex, OpIndex; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: TypeIndex = 0; break; + case MVT::i16: TypeIndex = 1; break; + case MVT::i32: TypeIndex = 2; break; + case MVT::i64: TypeIndex = 3; + if (!Subtarget->is64Bit()) + return false; + break; + } + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected div/rem opcode"); + case Instruction::SDiv: OpIndex = 0; break; + case Instruction::SRem: OpIndex = 1; break; + case Instruction::UDiv: OpIndex = 2; break; + case Instruction::URem: OpIndex = 3; break; + } + + const DivRemEntry &TypeEntry = OpTable[TypeIndex]; + const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) + return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) + return false; + + // Move op0 into low-order input register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); + // Zero-extend or sign-extend into high-order input register. + if (OpEntry.OpSignExtend) { + if (OpEntry.IsOpSigned) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpSignExtend)); + else { + unsigned Zero32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::MOV32r0), Zero32); + + // Copy the zero into the appropriate sub/super/identical physical + // register. Unfortunately the operations needed are not uniform enough to + // fit neatly into the table above. + if (VT.SimpleTy == MVT::i16) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT.SimpleTy == MVT::i32) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32); + } else if (VT.SimpleTy == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } + } + } + // Generate the DIV/IDIV instruction. 
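Before the DIV/IDIV itself is emitted just below, the OpTable above has already fixed which copy/extend opcodes set up the register pair and where the result lands. A concrete reading of the i32 rows, as standalone C++ with the selected sequence in comments (illustrative only, assumes a non-zero divisor):

#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> udivmod32(uint32_t a, uint32_t b) {
  // copy a -> EAX; MOV32r0 -> EDX (zero the high half); DIV32r b
  // quotient comes back in EAX, remainder in EDX
  return { a / b, a % b };
}

std::pair<int32_t, int32_t> sdivmod32(int32_t a, int32_t b) {
  // copy a -> EAX; CDQ (sign-extend EAX into EDX); IDIV32r b
  return { a / b, a % b };
}

// The i8 rows are the exception: the dividend is materialized in AX via
// MOVSX16rr8/MOVZX16rr8, and DIV8r/IDIV8r return the quotient in AL and the
// remainder in AH, so no separate high register needs to be set up.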
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. + // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. + if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } + UpdateValueMap(I, ResultReg); + + return true; +} + bool X86FastISel::X86SelectSelect(const Instruction *I) { MVT VT; if (!isTypeLegal(I->getType(), VT)) @@ -1526,9 +1722,6 @@ bool X86FastISel::FastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; - if (Subtarget->isTargetWindows()) - return false; - const Function *F = FuncInfo.Fn; if (F->isVarArg()) return false; @@ -1536,7 +1729,10 @@ bool X86FastISel::FastLowerArguments() { CallingConv::ID CC = F->getCallingConv(); if (CC != CallingConv::C) return false; - + + if (Subtarget->isCallingConvWin64(CC)) + return false; + if (!Subtarget->is64Bit()) return false; @@ -1580,8 +1776,6 @@ bool X86FastISel::FastLowerArguments() { const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++Idx) { - if (I->use_empty()) - continue; bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; @@ -1640,8 +1834,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Handle only C and fastcc calling conventions for now. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); + bool isWin64 = Subtarget->isCallingConvWin64(CC); if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall) + CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 && + CC != CallingConv::X86_64_SysV) return false; // fastcc with -tailcallopt is intended to provide a guaranteed @@ -1655,7 +1851,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Don't know how to handle Win64 varargs yet. Nothing special needed for // x86-32. Special handling for x86-64 is implemented. - if (isVarArg && Subtarget->isTargetWin64()) + if (isVarArg && isWin64) return false; // Fast-isel doesn't know about callee-pop yet. 
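A minimal illustration of the AH workaround in the block above (not code from the patch): AH cannot be referenced by an instruction that carries a REX prefix, so the remainder is extracted through a 16-bit shift instead.

#include <cstdint>

uint8_t urem8(uint8_t a, uint8_t b) {
  // Fast-isel selects DIV8r, which leaves the remainder in AH.  Instead of
  // copying out of AH directly, it copies AX into a fresh 16-bit register,
  // shifts that right by 8 (SHR16ri), and takes its sub_8bit subregister, so
  // the allocator may later place the result in a REX-only register
  // (r8b-r15b, sil, dil, ...) without producing an unencodable AH copy.
  return a % b;
}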
@@ -1785,7 +1981,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) + if (isWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1868,6 +2064,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { } else { unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>( + getTargetMachine()->getRegisterInfo()); AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; @@ -1899,7 +2097,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { X86::EBX).addReg(Base); } - if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + if (Subtarget->is64Bit() && isVarArg && !isWin64) { // Count the number of XMM registers allocated. static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, @@ -1968,7 +2166,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); - if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + if (Subtarget->is64Bit() && isVarArg && !isWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. @@ -2082,6 +2280,11 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { case Instruction::AShr: case Instruction::Shl: return X86SelectShift(I); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return X86SelectDivRem(I); case Instruction::Select: return X86SelectSelect(I); case Instruction::Trunc: @@ -2273,12 +2476,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { } -/// TryToFoldLoad - The specified machine instr operand is a vreg, and that -/// vreg is being provided by the specified load instruction. If possible, -/// try to fold the load as an operand to the instruction, returning true if -/// possible. -bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, - const LoadInst *LI) { +bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { X86AddressMode AM; if (!X86SelectAddress(LI->getOperand(0), AM)) return false; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp new file mode 100644 index 0000000..d21cb8a --- /dev/null +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -0,0 +1,253 @@ +//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will find instructions which +// can be re-written as LEA instructions in order to reduce pipeline +// delays for some models of the Intel Atom family. 
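For orientation, the kind of rewrite the new pass performs, sketched in comments (registers arbitrary; an illustration of the intent described in the file header above, not output of the pass):

// An ADD-immediate or register-to-register MOV that produces the base/index
// of a nearby memory access is replaced by an equivalent LEA:
//
//   before:  addq $8, %rdi          after:  leaq 8(%rdi), %rdi
//            movq (%rdi), %rax              movq (%rdi), %rax
//
// The idea, per the file comment above, is to avoid the extra delay some Atom
// models incur when an ALU result feeds address generation shortly afterwards.
// postRAConvertToLEA() below builds the LEA directly for MOV32rr/MOV64rr and
// defers the ADD-immediate forms to convertToThreeAddress().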
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-fixup-LEAs" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +STATISTIC(NumLEAs, "Number of LEA instructions created"); + +namespace { + class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + + virtual const char *getPassName() const { return "X86 Atom LEA Fixup";} + + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction. + /// If replacement succeeds, then also process the the newly created + /// instruction. + void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I); + + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + + /// \brief if an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. + MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; + + public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible. + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + MachineFunction *MF; + const TargetMachine *TM; + const TargetInstrInfo *TII; // Machine instruction info. + + }; + char FixupLEAPass::ID = 0; +} + +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const { + MachineInstr* MI = MBBI; + MachineInstr* NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand& Src = MI->getOperand(1); + const MachineOperand& Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get( MI->getOpcode() == X86::MOV32rr ? 
X86::LEA32r : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return 0; + } + } + return TII->convertToThreeAddress(MFI, MBBI, 0); +} + +FunctionPass *llvm::createX86FixupLEAs() { + return new FixupLEAPass(); +} + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + TM = &MF->getTarget(); + TII = TM->getInstrInfo(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. + for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, + MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr* MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand& opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()){ + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. +static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(MFI)) { + I = --MFI->end(); + return true; + } + else + return false; + } + --I; + return true; +} + +MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while( Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write){ + return CurInst; + } + InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return 0; +} + +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. 
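For reference, the backward search defined above (searchBackwards / getPreviousInstr), restated over plain data; the record type and names here are hypothetical, not LLVM API:

#include <cstddef>
#include <vector>

struct InstrRec {
  bool writesTargetReg;   // does this instruction define the base/index reg?
  bool isCallOrInlineAsm; // barriers that stop the search
  int latency;            // scheduling-model latency of this instruction
};

// Returns the index of the defining instruction, or -1 if it is more than
// Threshold latency cycles away or hidden behind a call / inline asm.
int searchBackwards(const std::vector<InstrRec> &Block, std::size_t Use,
                    int Threshold = 5) {
  int Distance = 1;
  for (std::size_t i = Use; i-- > 0;) {
    if (Block[i].isCallOrInlineAsm || Distance > Threshold)
      return -1;
    if (Block[i].writesTargetReg)
      return static_cast<int>(i);
    Distance += Block[i].latency;
  }
  return -1;
}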
+ MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc& Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +void FixupLEAPass::seekLEAFixup(MachineOperand& p, + MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... + DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator> (NewMI); + processInstruction(J, MFI); + } + } +} + +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) + processInstruction(I, MFI); + return false; +} diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 0585b43..48470da 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -115,9 +115,10 @@ namespace { unsigned Mask = 0; for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I - X86::FP0; - if (Reg < 8) - Mask |= 1 << Reg; + unsigned Reg = *I; + if (Reg < X86::FP0 || Reg > X86::FP6) + continue; + Mask |= 1 << (Reg - X86::FP0); } return Mask; } @@ -893,8 +894,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Produce implicit-defs for free by using killed registers. while (Kills && Defs) { - unsigned KReg = CountTrailingZeros_32(Kills); - unsigned DReg = CountTrailingZeros_32(Defs); + unsigned KReg = countTrailingZeros(Kills); + unsigned DReg = countTrailingZeros(Defs); DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); std::swap(RegMap[KReg], RegMap[DReg]); @@ -917,7 +918,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Manually kill the rest. while (Kills) { - unsigned KReg = CountTrailingZeros_32(Kills); + unsigned KReg = countTrailingZeros(Kills); DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); freeStackSlotBefore(I, KReg); Kills &= ~(1 << KReg); @@ -925,7 +926,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Load zeros for all the imp-defs. while(Defs) { - unsigned DReg = CountTrailingZeros_32(Defs); + unsigned DReg = countTrailingZeros(Defs); DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); pushReg(DReg); @@ -1636,7 +1637,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // Note: this might be a non-optimal pop sequence. We might be able to do // better by trying to pop in stack order or something. 
while (FPKills) { - unsigned FPReg = CountTrailingZeros_32(FPKills); + unsigned FPReg = countTrailingZeros(FPKills); if (isLive(FPReg)) freeStackSlotAfter(InsertPt, FPReg); FPKills &= ~(1U << FPReg); @@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) .addExternalSymbol("_ftol2") .addReg(X86::ST0, RegState::ImplicitKill) + .addReg(X86::ECX, RegState::ImplicitDefine) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::EDX, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 54cbd40..b994e67 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -307,12 +307,12 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, unsigned FramePtr) const { MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); bool HasFP = hasFP(MF); @@ -360,16 +360,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, if (HasFP && FramePtr == Reg) continue; - MachineLocation CSDst(MachineLocation::VirtualFP, Offset); - MachineLocation CSSrc(Reg); - Moves.push_back(MachineMove(Label, CSDst, CSSrc)); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset)); } } /// getCompactUnwindRegNum - Get the compact unwind number for a given /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. -static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) { +static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) { + static const uint16_t CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const uint16_t CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -398,16 +404,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], // 4 3 // 5 3 // - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? 
CU64BitRegs : CU32BitRegs); - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { - int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); + int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; } @@ -466,14 +464,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], bool Is64Bit) { - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to CU_NUM_SAVED_REGS. uint32_t RegEnc = 0; @@ -481,7 +471,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], unsigned Reg = SavedRegs[I]; if (Reg == 0) continue; - int CURegNum = getCompactUnwindRegNum(CURegs, Reg); + int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit); if (CURegNum == -1) return ~0U; // Encode the 3-bit register number in order, skipping over 3-bits for each @@ -528,11 +518,17 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (!MI.getFlag(MachineInstr::FrameSetup)) break; // We don't exect any more prolog instructions. - if (ExpectEnd) return 0; + if (ExpectEnd) return CU::UNWIND_MODE_DWARF; if (Opc == PushInstr) { // If there are too many saved registers, we cannot use compact encoding. - if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0; + if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF; + + unsigned Reg = MI.getOperand(0).getReg(); + if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) { + ExpectEnd = true; + continue; + } SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); StackAdjust += OffsetSize; @@ -542,7 +538,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned DstReg = MI.getOperand(0).getReg(); if (DstReg != FramePtr || SrcReg != StackPtr) - return 0; + return CU::UNWIND_MODE_DWARF; StackAdjust = 0; memset(SavedRegs, 0, sizeof(SavedRegs)); @@ -552,7 +548,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { Opc == X86::SUB32ri || Opc == X86::SUB32ri8) { if (StackSize) // We already have a stack size. - return 0; + return CU::UNWIND_MODE_DWARF; if (!MI.getOperand(0).isReg() || MI.getOperand(0).getReg() != MI.getOperand(1).getReg() || @@ -560,7 +556,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // We need this to be a stack adjustment pointer. Something like: // // %RSP<def> = SUB64ri8 %RSP, 48 - return 0; + return CU::UNWIND_MODE_DWARF; StackSize = MI.getOperand(2).getImm() / StackDivide; SubtractInstrIdx += InstrOffset; @@ -574,31 +570,31 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (HasFP) { if ((StackAdjust & 0xFF) != StackAdjust) // Offset was too big for compact encoding. - return 0; + return CU::UNWIND_MODE_DWARF; // Get the encoding of the saved registers when we have a frame pointer. 
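A standalone sketch of the lookup that getCompactUnwindRegNum() now performs: the saved register is searched in the (zero-terminated) 32- or 64-bit list and its 1-based position is the compact-unwind register number; -1 means the frame cannot be described compactly and the caller falls back to CU::UNWIND_MODE_DWARF.

#include <cstdint>

static int cuRegNum(const uint16_t *List, uint16_t Reg) {
  for (int Idx = 1; *List; ++List, ++Idx)  // list is 0-terminated, 1-based
    if (*List == Reg)
      return Idx;
  return -1;
}

// With a frame pointer, one such 3-bit number per saved register is packed
// into the low bits of the encoding word and masked with
// CU::UNWIND_BP_FRAME_REGISTERS (0x7FFF), as in the code just below.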
uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); - if (RegEnc == ~0U) return 0; + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; - CompactUnwindEncoding |= 0x01000000; + CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME; CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; - CompactUnwindEncoding |= RegEnc & 0x7FFF; + CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS; } else { ++StackAdjust; uint32_t TotalStackSize = StackAdjust + StackSize; if ((TotalStackSize & 0xFF) == TotalStackSize) { // Frameless stack with a small stack size. - CompactUnwindEncoding |= 0x02000000; + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD; // Encode the stack size. CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16; } else { if ((StackAdjust & 0x7) != StackAdjust) // The extra stack adjustments are too big for us to handle. - return 0; + return CU::UNWIND_MODE_DWARF; // Frameless stack with an offset too large for us to encode compactly. - CompactUnwindEncoding |= 0x03000000; + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND; // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' // instruction. @@ -616,10 +612,11 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx, Is64Bit); - if (RegEnc == ~0U) return 0; + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; // Encode the register encoding. - CompactUnwindEncoding |= RegEnc & 0x3FF; + CompactUnwindEncoding |= + RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION; } return CompactUnwindEncoding; @@ -734,7 +731,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // REG < 64 => DW_CFA_offset + Reg // ELSE => DW_CFA_offset_extended - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); uint64_t NumBytes = 0; int stackGrowth = -SlotSize; @@ -767,20 +763,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(FrameLabel); // Define the current CFA rule to use the provided offset. - if (StackSize) { - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - } else { - MachineLocation SPDst(StackPtr); - MachineLocation SPSrc(StackPtr, stackGrowth); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - } + assert(StackSize); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. - MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth); - MachineLocation FPSrc(FramePtr); - Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr, + 2 * stackGrowth)); } // Update EBP with the new base value. @@ -796,9 +786,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(FrameLabel); // Define the current CFA to use the EBP/RBP register. - MachineLocation FPDst(FramePtr); - MachineLocation FPSrc(MachineLocation::VirtualFP); - Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr)); } // Mark the FramePtr as live-in in every block except the entry. 
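Hedged illustration of the MachineMove-to-MCCFIInstruction switch in the prologue code above: once streamed out as assembly, the three kinds of records correspond to the usual CFI directives of an x86-64 frame-pointer prologue, roughly:

//   pushq %rbp
//   .cfi_def_cfa_offset 16        // MCCFIInstruction::createDefCfaOffset
//   .cfi_offset %rbp, -16         // MCCFIInstruction::createOffset
//   movq  %rsp, %rbp
//   .cfi_def_cfa_register %rbp    // MCCFIInstruction::createDefCfaRegister
//
// The register arguments are DWARF numbers, hence the getDwarfRegNum()
// translation visible in the hunk above.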
@@ -826,10 +816,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); // Define the current CFA rule to use the provided offset. - unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr; - MachineLocation SPDst(Ptr); - MachineLocation SPSrc(Ptr, StackOffset); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + assert(StackSize); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(Label, StackOffset)); StackOffset += stackGrowth; } } @@ -925,11 +914,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - // MSVC x64's __chkstk needs to adjust %rsp. - // FIXME: %rax preserves the offset and should be available. - if (isSPUpdateNeeded) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, - UseLEA, TII, *RegInfo); + // MSVC x64's __chkstk does not adjust %rsp itself. + // It also does not clobber %rax so we can reuse it when adjusting %rsp. + if (isSPUpdateNeeded) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) + .addReg(StackPtr) + .addReg(X86::RAX) + .setMIFlag(MachineInstr::FrameSetup); + } if (isEAXAlive) { // Restore EAX @@ -963,16 +955,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. - if (StackSize) { - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, - -StackSize + stackGrowth); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); - } else { - MachineLocation SPDst(StackPtr); - MachineLocation SPSrc(StackPtr, stackGrowth); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); - } + assert(StackSize); + MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset( + Label, -StackSize + stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. @@ -1338,7 +1323,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area @@ -1351,7 +1336,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, - (-1U*SlotSize)+TailCallReturnAddrDelta, true); + TailCallReturnAddrDelta - SlotSize, true); } if (hasFP(MF)) { diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 3f08b9a..6e309d8 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -19,8 +19,35 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class MCSymbol; - class X86TargetMachine; + +namespace CU { + + /// Compact unwind encoding values. + enum CompactUnwindEncodings { + /// [RE]BP based frame where [RE]BP is pused on the stack immediately after + /// the return address, then [RE]SP is moved to [RE]BP. + UNWIND_MODE_BP_FRAME = 0x01000000, + + /// A frameless function with a small constant stack size. + UNWIND_MODE_STACK_IMMD = 0x02000000, + + /// A frameless function with a large constant stack size. + UNWIND_MODE_STACK_IND = 0x03000000, + + /// No compact unwind encoding is available. 
+ UNWIND_MODE_DWARF = 0x04000000, + + /// Mask for encoding the frame registers. + UNWIND_BP_FRAME_REGISTERS = 0x00007FFF, + + /// Mask for encoding the frameless registers. + UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF + }; + +} // end CU namespace + +class MCSymbol; +class X86TargetMachine; class X86FrameLowering : public TargetFrameLowering { const X86TargetMachine &TM; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 00fbe69..9465420 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -141,10 +141,6 @@ namespace { /// SelectionDAG operations. /// class X86DAGToDAGISel : public SelectionDAGISel { - /// X86Lowering - This object fully describes how to lower LLVM code to an - /// X86-specific SelectionDAG. - const X86TargetLowering &X86Lowering; - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; @@ -156,7 +152,6 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), - X86Lowering(*tm.getTargetLowering()), Subtarget(&tm.getSubtarget<X86Subtarget>()), OptForSize(false) {} @@ -200,9 +195,13 @@ namespace { bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + bool SelectMOV64Imm32(SDValue N, SDValue &Imm); bool SelectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); bool SelectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); @@ -229,14 +228,15 @@ namespace { SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) : + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, + getTargetLowering()->getPointerTy()) : AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP relative offset // is 32-bit. if (AM.GV) - Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(), + Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) @@ -373,7 +373,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = - CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(), + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, &Ops[0], Ops.size()); Ops.clear(); Ops.push_back(NewChain); @@ -444,7 +444,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - (N->getOpcode() == X86ISD::CALL || + // Only does this when target favors doesn't favor register indirect + // call. + ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || (N->getOpcode() == X86ISD::TC_RETURN && // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || @@ -498,8 +500,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. 
- bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); - bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(getTargetLowering()); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; @@ -522,7 +526,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { MemVT = SrcIsSSE ? SrcVT : DstVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, @@ -780,7 +784,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, return true; EVT VT = N.getValueType(); - DebugLoc DL = N.getDebugLoc(); + SDLoc DL(N); SDValue Eight = DAG.getConstant(8, MVT::i8); SDValue NewMask = DAG.getConstant(0xff, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); @@ -828,7 +832,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, return true; EVT VT = N.getValueType(); - DebugLoc DL = N.getDebugLoc(); + SDLoc DL(N); SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); @@ -884,8 +888,8 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); - unsigned MaskLZ = CountLeadingZeros_64(Mask); - unsigned MaskTZ = CountTrailingZeros_64(Mask); + unsigned MaskLZ = countLeadingZeros(Mask); + unsigned MaskTZ = countTrailingZeros(Mask); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. @@ -930,11 +934,11 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, if (ReplacingAnyExtend) { assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. - SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X); + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); InsertDAGNode(DAG, N, NewX); X = NewX; } - DebugLoc DL = N.getDebugLoc(); + SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); @@ -958,7 +962,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { - DebugLoc dl = N.getDebugLoc(); + SDLoc dl(N); DEBUG({ dbgs() << "MatchAddress: "; AM.dump(); @@ -1378,6 +1382,71 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } +bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { + if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CN->getZExtValue(); + if ((uint32_t)ImmVal != (uint64_t)ImmVal) + return false; + + Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64); + return true; + } + + // In static codegen with small code model, we can get the address of a label + // into a register with 'movl'. TableGen has already made sure we're looking + // at a label of some kind. 
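The immediate test at the top of SelectMOV64Imm32 (just above) accepts a 64-bit constant only when it round-trips through uint32_t, i.e. when a zero-extending 32-bit movl can materialize it. The same check with a few concrete values, as a standalone program (illustrative only):

#include <cassert>
#include <cstdint>

static bool fitsInUnsigned32(uint64_t ImmVal) {
  // Acceptance form of the early-return test above.
  return (uint64_t)(uint32_t)ImmVal == ImmVal;
}

int main() {
  assert(fitsInUnsigned32(0));                 // movl $0, %eax
  assert(fitsInUnsigned32(0xFFFFFFFFull));     // movl $0xffffffff, %eax; upper
                                               // 32 bits are cleared implicitly
  assert(!fitsInUnsigned32(0x100000000ull));   // needs a full movabsq
  assert(!fitsInUnsigned32(UINT64_MAX));       // i64 -1: the sign bits matter
  return 0;
}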
+ assert(N->getOpcode() == X86ISD::Wrapper && + "Unexpected node type for MOV32ri64"); + N = N.getOperand(0); + + if (N->getOpcode() != ISD::TargetConstantPool && + N->getOpcode() != ISD::TargetJumpTable && + N->getOpcode() != ISD::TargetGlobalAddress && + N->getOpcode() != ISD::TargetExternalSymbol && + N->getOpcode() != ISD::TargetBlockAddress) + return false; + + Imm = N; + return TM.getCodeModel() == CodeModel::Small; +} + +bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + return false; + + SDLoc DL(N); + RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); + if (RN && RN->getReg() == 0) + Base = CurDAG->getRegister(0, MVT::i64); + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) { + // Base could already be %rip, particularly in the x32 ABI. + Base = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), + Base, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + + RN = dyn_cast<RegisterSDNode>(Index); + if (RN && RN->getReg() == 0) + Index = CurDAG->getRegister(0, MVT::i64); + else { + assert(Index.getValueType() == MVT::i32 && + "Expect to be extending 32-bit registers for use in LEA"); + Index = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), + Index, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + + return true; +} + /// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, @@ -1485,7 +1554,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, + getTargetLowering()->getPointerTy()).getNode(); } SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { @@ -1500,9 +1570,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), - MVT::i32, MVT::i32, MVT::Other, Ops, - array_lengthof(Ops)); + SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), + MVT::i32, MVT::i32, MVT::Other, Ops); cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); return ResNode; } @@ -1636,7 +1705,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { // + empty, the operand is not needed any more with the new op selected. // + non-empty, otherwise. static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, - DebugLoc dl, + SDLoc dl, enum AtomicOpc &Op, EVT NVT, SDValue Val) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { @@ -1688,7 +1757,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); // Optimize common patterns for __sync_or_and_fetch and similar arith // operations where the result is not used. 
This allows us to use the "lock" @@ -1718,7 +1787,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { Op = ADD; break; } - + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); bool isUnOp = !Val.getNode(); bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); @@ -1770,12 +1839,10 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); if (isUnOp) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; - Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, - array_lengthof(Ops)), 0); + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } else { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; - Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, - array_lengthof(Ops)), 0); + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); SDValue RetVals[] = { Undef, Ret }; @@ -1921,7 +1988,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, if (ChainCheck) // Make a new TokenFactor with all the other input chains except // for the load. - InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, &ChainOps[0], ChainOps.size()); } if (!ChainCheck) @@ -1969,8 +2036,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { SDValue Segment = CurDAG->getRegister(0, MVT::i32); const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, Disp, Segment, VMask, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), - VTs, Ops, array_lengthof(Ops)); + SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops); // Node has 2 outputs: VDst and MVT::Other. // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. 
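Illustration of the "result not used" case that the comment near the top of this hunk describes for __sync_or_and_fetch-style operations (standalone C++, not code from the patch):

#include <atomic>

void bump(std::atomic<int> &Counter) {
  Counter.fetch_add(1, std::memory_order_seq_cst);
  // Result unused -> can be selected as a single  lock addl $1, (%rdi)
}

int bump_and_read(std::atomic<int> &Counter) {
  return Counter.fetch_add(1, std::memory_order_seq_cst);
  // Result used -> needs the exchanging form, e.g.  lock xaddl %eax, (%rdi)
}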
// We replace VDst of Node with VDst of ResNode, and Other of Node with Other @@ -1984,7 +2050,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; unsigned Opcode = Node->getOpcode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); @@ -2015,6 +2081,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_d_d_256: case Intrinsic::x86_avx2_gather_q_d: case Intrinsic::x86_avx2_gather_q_d_256: { + if (!Subtarget->hasAVX2()) + break; unsigned Opc; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); @@ -2184,7 +2252,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); SDValue Ops[] = {N1, InFlag}; - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); @@ -2265,16 +2333,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { InFlag }; if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, - array_lengthof(Ops)); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); Chain = SDValue(CNode, 2); InFlag = SDValue(CNode, 3); } else { SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, - array_lengthof(Ops)); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); Chain = SDValue(CNode, 0); InFlag = SDValue(CNode, 1); } @@ -2285,15 +2351,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { N1, InFlag }; if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, - array_lengthof(Ops)); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); InFlag = SDValue(CNode, 2); } else { SDVTList VTs = CurDAG->getVTList(MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, - array_lengthof(Ops)); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); InFlag = SDValue(CNode, 0); } } @@ -2369,27 +2433,24 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } unsigned LoReg, HiReg, ClrReg; - unsigned ClrOpcode, SExtOpcode; + unsigned SExtOpcode; switch (NVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; - ClrOpcode = 0; SExtOpcode = X86::CBW; break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; - ClrOpcode = X86::MOV16r0; ClrReg = X86::DX; + ClrReg = X86::DX; SExtOpcode = X86::CWD; break; case MVT::i32: LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; - ClrOpcode = X86::MOV32r0; SExtOpcode = X86::CDQ; break; case MVT::i64: LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; - ClrOpcode = X86::MOV64r0; SExtOpcode = X86::CQO; break; } @@ -2407,8 +2468,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, - MVT::Other, Ops, - array_lengthof(Ops)), 0); + MVT::Other, Ops), 0); Chain = Move.getValue(1); ReplaceUses(N0.getValue(1), Chain); } else { @@ -2428,8 +2488,29 @@ 
SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - SDValue ClrNode = - SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0); + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + switch (NVT.getSimpleVT().SimpleTy) { + case MVT::i16: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, + CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)), + 0); + break; + case MVT::i32: + break; + case MVT::i64: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + break; + default: + llvm_unreachable("Unexpected division source"); + } + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, ClrNode, InFlag).getValue(1); } @@ -2439,8 +2520,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, - array_lengthof(Ops)); + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); @@ -2451,6 +2531,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Prevent use of AH in a REX instruction by referencing AX instead. // Shift it down 8 bits. + // + // The current assumption of the register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. if (HiReg == X86::AH && Subtarget->is64Bit() && !SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, @@ -2671,9 +2756,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT LdVT = LoadNode->getMemoryVT(); unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); MachineSDNode *Result = CurDAG->getMachineNode(newOpc, - Node->getDebugLoc(), - MVT::i32, MVT::Other, Ops, - array_lengthof(Ops)); + SDLoc(Node), + MVT::i32, MVT::Other, Ops); Result->setMemRefs(MemOp, MemOp + 2); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e6858bc..9ffe29f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -55,20 +55,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); // Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 instruction or a -/// simple subregister reference. Idx is an index in the 128 bits we -/// want. It need not be aligned to a 128-bit bounday. That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. 
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, DebugLoc dl) { +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); EVT VT = Vec.getValueType(); - assert(VT.is256BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); @@ -76,13 +73,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) * ElemsPerChunk); // If the input is a buildvector just emit a smaller one. @@ -95,38 +91,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, VecIdx); return Result; + +} +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128 instruction or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - DebugLoc dl) { +/// Generate a DAG to grab 256-bits from a 512-bit vector. +static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; - EVT VT = Vec.getValueType(); - assert(VT.is128BitVector() && "Unexpected vector size!"); - EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); - // Insert the relevant 128 bits. 
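A worked instance of the index normalization used by ExtractSubVector / InsertSubVector above, with the same formula spelled out in plain arithmetic (no LLVM types):

#include <cassert>

int main() {
  // v8i32 (256 bits), extracting the 128-bit chunk that holds element 5:
  unsigned EltBits = 32, VectorWidth = 128, IdxVal = 5;
  unsigned ElemsPerChunk = VectorWidth / EltBits;                      // 4
  unsigned NormalizedIdx = ((IdxVal * EltBits) / VectorWidth) * ElemsPerChunk;
  assert(NormalizedIdx == 4);   // i.e. the upper half, elements 4..7
  return 0;
}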
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of @@ -134,11 +163,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); bool is64Bit = Subtarget->is64Bit(); @@ -163,10 +199,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - - RegInfo = TM.getRegisterInfo(); TD = getDataLayout(); + resetOperationActions(); +} + +void X86TargetLowering::resetOperationActions() { + const TargetMachine &TM = getTargetMachine(); + static bool FirstTimeThrough = true; + + // If none of the target options have changed, then we don't need to reset the + // operation actions. + if (!FirstTimeThrough && TO == TM.Options) return; + + if (!FirstTimeThrough) { + // Reinitialize the actions. + initActions(); + FirstTimeThrough = false; + } + + TO = TM.Options; + // Set up the TargetLowering object. 
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -184,6 +237,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2 @@ -470,7 +525,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); - // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build @@ -508,16 +563,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); - setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); - // On X86 and X86-64, atomic operations are lowered to locked instructions. - // Locked instructions, in turn, have implicit fence semantics (all memory - // operations are flushed before issuing the locked instruction, and they - // are not buffered), so we can fold away the common pattern of - // fence-atomic-fence. - setShouldFoldAtomicFences(true); - // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; @@ -552,10 +599,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); if (Subtarget->is64Bit()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); @@ -575,10 +618,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit()) { + if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { + // TargetInfo::X86_64ABIBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { + // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } @@ -989,7 +1034,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } - if (Subtarget->hasSSE41()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -1053,23 +1098,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); - if (Subtarget->hasInt256()) { - setOperationAction(ISD::SRL, MVT::v2i64, Legal); - 
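The VASTART/VAARG hunk above encodes a small ABI decision: 64-bit targets get the structured x86-64 va_list unless they are Win64, which keeps the plain char* form and therefore the expanded VAARG/VACOPY. A one-function sketch of that choice (the enum and function names are invented):

#include <cstdio>

enum class VAListKind { X86_64ABIBuiltinVaList, CharPtrBuiltinVaList };

// Win64 keeps the simple char* va_list even in 64-bit mode; only the
// SysV 64-bit path needs the custom VAARG/VACOPY lowering.
VAListKind pickVAListKind(bool Is64Bit, bool IsTargetWin64) {
  return (Is64Bit && !IsTargetWin64) ? VAListKind::X86_64ABIBuiltinVaList
                                     : VAListKind::CharPtrBuiltinVaList;
}

int main() {
  std::printf("%d\n", pickVAListKind(true, true) ==
                          VAListKind::CharPtrBuiltinVaList); // prints 1
  return 0;
}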
setOperationAction(ISD::SRL, MVT::v4i32, Legal); - - setOperationAction(ISD::SHL, MVT::v2i64, Legal); - setOperationAction(ISD::SHL, MVT::v4i32, Legal); + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Legal); - } else { - setOperationAction(ISD::SRL, MVT::v2i64, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); - setOperationAction(ISD::SHL, MVT::v2i64, Custom); - setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v4i32, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Custom); - } setOperationAction(ISD::SDIV, MVT::v8i16, Custom); setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } @@ -1118,6 +1156,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); @@ -1186,14 +1225,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - setOperationAction(ISD::SRL, MVT::v4i64, Legal); - setOperationAction(ISD::SRL, MVT::v8i32, Legal); - - setOperationAction(ISD::SHL, MVT::v4i64, Legal); - setOperationAction(ISD::SHL, MVT::v8i32, Legal); - - setOperationAction(ISD::SRA, MVT::v8i32, Legal); - setOperationAction(ISD::SDIV, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); @@ -1210,15 +1241,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); // Don't lower v32i8 because there is no 128-bit byte mul + } - setOperationAction(ISD::SRL, MVT::v4i64, Custom); - setOperationAction(ISD::SRL, MVT::v8i32, Custom); + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); - setOperationAction(ISD::SHL, MVT::v4i64, Custom); - setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); - setOperationAction(ISD::SRA, MVT::v8i32, Custom); - } + setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. 
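Both the 128-bit and 256-bit hunks replace per-subtarget Legal/Custom shift tables with a single Custom entry, deferring the AVX2-native cases to the custom lowering. The deleted Legal lines spell out exactly which those cases are; here is a standalone predicate capturing them (names invented, strings stand in for MVTs):

#include <cassert>
#include <string>

// Variable vector shifts AVX2 can select directly, per the Legal entries
// removed above; everything else goes through the custom expansion.
bool avx2HasNativeShift(const std::string &Op, const std::string &VT) {
  if (Op == "SRL" || Op == "SHL")
    return VT == "v2i64" || VT == "v4i32" || VT == "v4i64" || VT == "v8i32";
  if (Op == "SRA")
    return VT == "v4i32" || VT == "v8i32";
  return false;
}

int main() {
  assert(avx2HasNativeShift("SHL", "v4i64"));
  assert(!avx2HasNativeShift("SRA", "v4i64")); // no native 64-bit arithmetic shift
  return 0;
}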
for (int i = MVT::FIRST_VECTOR_VALUETYPE; @@ -1264,6 +1297,150 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v16f32, Legal); + setOperationAction(ISD::LOAD, MVT::v8f64, Legal); + setOperationAction(ISD::LOAD, MVT::v8i64, Legal); + setOperationAction(ISD::LOAD, MVT::v16i32, Legal); + setOperationAction(ISD::LOAD, MVT::v16i1, Legal); + + setOperationAction(ISD::FADD, MVT::v16f32, Legal); + setOperationAction(ISD::FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + + setOperationAction(ISD::FADD, MVT::v8f64, Legal); + setOperationAction(ISD::FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FMA, MVT::v8f64, Legal); + setOperationAction(ISD::FMA, MVT::v16f32, Legal); + setOperationAction(ISD::SDIV, MVT::v16i32, Custom); + + + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + + setOperationAction(ISD::TRUNCATE, MVT::i1, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + + setOperationAction(ISD::SETCC, MVT::v16i1, Custom); + setOperationAction(ISD::SETCC, MVT::v8i1, Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8f64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i64, Custom); + setOperationAction(ISD::SELECT, MVT::v16f32, 
Custom); + + setOperationAction(ISD::ADD, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v16i32, Legal); + + setOperationAction(ISD::SUB, MVT::v8i64, Legal); + setOperationAction(ISD::SUB, MVT::v16i32, Legal); + + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::SRL, MVT::v8i64, Custom); + setOperationAction(ISD::SRL, MVT::v16i32, Custom); + + setOperationAction(ISD::SHL, MVT::v8i64, Custom); + setOperationAction(ISD::SHL, MVT::v16i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i64, Custom); + setOperationAction(ISD::SRA, MVT::v16i32, Custom); + + setOperationAction(ISD::AND, MVT::v8i64, Legal); + setOperationAction(ISD::OR, MVT::v8i64, Legal); + setOperationAction(ISD::XOR, MVT::v8i64, Legal); + + // Custom lower several nodes. + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Extract subvector is special because the value type + // (result) is 256/128-bit but the source is 512-bit wide. + if (VT.is128BitVector() || VT.is256BitVector()) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + if (VT.getVectorElementType() == MVT::i1) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + // Do not attempt to custom lower other non-512-bit vectors + if (!VT.is512BitVector()) + continue; + + if (VT != MVT::v8i64) { + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + } + if ( EltSize >= 32) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } + } + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + // Do not attempt to promote non-256-bit vectors + if (!VT.is512BitVector()) + continue; + + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v8i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + } + }// has AVX-512 + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; @@ -1356,7 +1533,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(4); // 2^4 bytes. - BenefitFromCodePlacementOpt = true; // Predictable cmov don't hurt on atom because it's in-order. PredictableSelectIsExpensive = !Subtarget->isAtom(); @@ -1364,7 +1540,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setPrefFunctionAlignment(4); // 2^4 bytes. 
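The promotion loop above rewrites AND/OR/XOR on the other 512-bit types as v8i64 operations. That is sound because bitwise operations ignore element boundaries, which the self-contained check below demonstrates on plain arrays (no LLVM types involved):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // The same 512 bits viewed as sixteen i32s or eight i64s.
  uint32_t A32[16], B32[16], R32[16];
  for (int i = 0; i < 16; ++i) {
    A32[i] = 0x01234567u * (i + 1);
    B32[i] = 0x89abcdefu ^ (unsigned)i;
  }
  for (int i = 0; i < 16; ++i)
    R32[i] = A32[i] & B32[i];

  uint64_t A64[8], B64[8], R64[8];
  std::memcpy(A64, A32, sizeof(A64));
  std::memcpy(B64, B32, sizeof(B64));
  for (int i = 0; i < 8; ++i)
    R64[i] = A64[i] & B64[i];

  // Identical results, so promoting a v16i32 AND to v8i64 loses nothing.
  assert(std::memcmp(R32, R64, sizeof(R32)) == 0);
  return 0;
}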
} -EVT X86TargetLowering::getSetCCResultType(EVT VT) const { +EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i8; return VT.changeVectorElementTypeToInteger(); } @@ -1507,9 +1683,9 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) - // This doesn't have DebugLoc associated with it, but is not really the + // This doesn't have SDLoc associated with it, but is not really the // same as a Register. - return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); + return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); return Table; } @@ -1596,7 +1772,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); @@ -1679,10 +1855,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, // The x86-64 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. - if (Subtarget->is64Bit() && - DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { + if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); unsigned Reg = FuncInfo->getSRetReturnReg(); @@ -1690,12 +1867,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX; + unsigned RetValReg + = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? + X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. - RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64)); + RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); } RetOps[0] = Chain; // Update chain. @@ -1761,7 +1940,7 @@ SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // Assign locations to each value returned by this call. 
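getSetCCResultType gains an LLVMContext parameter but keeps the same rule: scalar compares yield i8, vector compares yield a vector of integers with the source's element count and width. A toy string-based restatement of that rule (purely illustrative naming):

#include <cassert>
#include <string>

// Mirror of the rule in getSetCCResultType, with types spelled as strings.
std::string setCCResultType(bool IsVector, unsigned NumElts, unsigned EltBits) {
  if (!IsVector)
    return "i8";
  return "v" + std::to_string(NumElts) + "i" + std::to_string(EltBits);
}

int main() {
  assert(setCCResultType(false, 0, 0) == "i8");    // scalar compare
  assert(setCCResultType(true, 4, 32) == "v4i32"); // e.g. a v4f32 compare
  return 0;
}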
@@ -1795,7 +1974,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; SDValue Ops[] = { Chain, InFlag }; Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, - MVT::Other, MVT::Glue, Ops, 2), 1); + MVT::Other, MVT::Glue, Ops), 1); Val = Chain.getValue(0); // Round the f80 to the right size, which also moves it to the appropriate @@ -1868,7 +2047,7 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), @@ -1883,13 +2062,19 @@ static bool IsTailCallConvention(CallingConv::ID CC) { CC == CallingConv::HiPE); } +/// \brief Return true if the calling convention is a C calling convention. +static bool IsCCallConvention(CallingConv::ID CC) { + return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || + CC == CallingConv::X86_64_SysV); +} + bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; return true; @@ -1906,7 +2091,7 @@ SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const { @@ -1948,7 +2133,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -1964,7 +2149,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWindows = Subtarget->isTargetWindows(); - bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); assert(!(isVarArg && IsTailCallConvention(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); @@ -1975,9 +2160,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 - if (IsWin64) { + if (IsWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeFormalArguments(Ins, CC_X86); @@ -2003,12 +2187,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2049,9 +2239,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // The x86-64 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. 
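The new IsCCallConvention helper widens the old `CalleeCC != CallingConv::C` checks so that the explicit x86-64 Win64 and SysV conventions are also treated as C for tail-call purposes. A minimal standalone restatement (the enum here is a stand-in, not llvm::CallingConv):

#include <cassert>

enum class CC { C, X86_64_Win64, X86_64_SysV, Fast, GHC, HiPE };

// Matches the body of IsCCallConvention added above.
bool isCCallConvention(CC Conv) {
  return Conv == CC::C || Conv == CC::X86_64_Win64 || Conv == CC::X86_64_SysV;
}

int main() {
  assert(isCCallConvention(CC::X86_64_SysV));
  assert(!isCCallConvention(CC::GHC)); // GHC/HiPE go through IsTailCallConvention
  return 0;
}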
+ // Win32 requires us to put the sret argument to %eax as well. // Save the argument into a virtual register so that we can access it // from the return points. - if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { + if (MF.getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { @@ -2223,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); @@ -2243,7 +2435,7 @@ SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, - int FPDiff, DebugLoc dl) const { + int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); @@ -2259,12 +2451,13 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, - unsigned SlotSize, int FPDiff, DebugLoc dl) { + unsigned SlotSize, int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), @@ -2276,10 +2469,10 @@ SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; - SmallVector<SDValue, 32> &OutVals = CLI.OutVals; - SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; @@ -2288,7 +2481,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); - bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); bool IsWindows = Subtarget->isTargetWindows(); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; @@ -2321,9 +2514,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 - if (IsWin64) { + if (IsWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(Outs, CC_X86); @@ -2352,7 +2544,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue RetAddrFrIdx; // Load return address for tail calls. 
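EmitTailCallStoreRetAddr's only arithmetic is the new slot offset, and the hunk's one functional change is widening FPDiff to int64_t before the subtraction. A standalone restatement of that computation, with sample values:

#include <cassert>
#include <cstdint>

// Offset of the relocated return-address slot, given the frame-size delta
// FPDiff and the pointer slot size (4 or 8 bytes).
int64_t newReturnAddrOffset(int FPDiff, unsigned SlotSize) {
  return (int64_t)FPDiff - SlotSize; // widen first, as the hunk above now does
}

int main() {
  assert(newReturnAddrOffset(16, 8) == 8);
  assert(newReturnAddrOffset(4, 8) == -4); // negative offsets are expected
  return 0;
}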
@@ -2366,6 +2559,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); @@ -2441,7 +2636,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of @@ -2638,7 +2833,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } @@ -2697,7 +2892,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(NumBytesForCalleeToPush, true), - InFlag); + InFlag, dl); InFlag = Chain.getValue(1); } @@ -2745,6 +2940,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; @@ -2829,13 +3026,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && - CalleeCC != CallingConv::C) + if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = DAG.getMachineFunction().getFunction(); + const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to @@ -2845,6 +3041,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) @@ -2857,6 +3055,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. 
+ const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -2876,7 +3076,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. - if (Subtarget->isTargetWin64()) + if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector<CCValAssign, 16> ArgLocs; @@ -2951,9 +3151,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { @@ -3060,7 +3259,7 @@ static bool isTargetShuffle(unsigned Opcode) { } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3071,7 +3270,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3085,7 +3284,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3098,7 +3297,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3117,13 +3316,16 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. 
unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, + -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3626,7 +3828,7 @@ static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getValueType(0).getSimpleVT(); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); if (VT != MVT::v8i32 && VT != MVT::v8f32) return SDValue(); @@ -3684,12 +3886,10 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { @@ -3723,11 +3923,10 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { @@ -3768,12 +3967,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; @@ -3803,11 +4000,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) @@ -4039,42 +4235,59 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { return true; } -/// isVEXTRACTF128Index - Return true if the specified +/// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for input to VEXTRACTF128. 
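The `-SlotSize` to `-(int64_t)SlotSize` change above (like the `(int64_t)FPDiff` cast earlier) guards against unsigned wrap-around before the value reaches a signed offset parameter. A two-line demonstration of the difference:

#include <cstdio>

int main() {
  unsigned SlotSize = 8;
  long long Bad  = -SlotSize;            // unsigned negation wraps: 4294967288
  long long Good = -(long long)SlotSize; // -8, the intended frame offset
  std::printf("%lld %lld\n", Bad, Good);
  return 0;
}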
-bool X86::isVEXTRACTF128Index(SDNode *N) { +/// suitable for instruction that extract 128 or 256 bit vectors +static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) return false; - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getValueType(0).getSimpleVT(); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } -/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to -/// VINSERTF128. -bool X86::isVINSERTF128Index(SDNode *N) { +/// insertion of 128 or 256-bit subvectors +static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) return false; - - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getValueType(0).getSimpleVT(); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } +bool X86::isVINSERT128Index(SDNode *N) { + return isVINSERTIndex(N, 128); +} + +bool X86::isVINSERT256Index(SDNode *N) { + return isVINSERTIndex(N, 256); +} + +bool X86::isVEXTRACT128Index(SDNode *N) { + return isVEXTRACTIndex(N, 128); +} + +bool X86::isVEXTRACT256Index(SDNode *N) { + return isVEXTRACTIndex(N, 256); +} + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. @@ -4178,12 +4391,10 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } -/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// instructions. -unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); + llvm_unreachable("Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4191,16 +4402,14 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } -/// getInsertVINSERTF128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// instructions. 
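The generalized isVEXTRACTIndex/isVINSERTIndex checks and the immediate helper above reduce to two small formulas: the element index must sit on a vecWidth-bit boundary, and the instruction immediate is the number of the chunk that index falls in. Checked standalone (function names invented):

#include <cassert>

// Valid extract/insert index: the starting bit offset is a multiple of the
// subvector width (128 or 256).
bool isAlignedSubvectorIndex(unsigned Index, unsigned EltBits, unsigned VecWidth) {
  return (Index * EltBits) % VecWidth == 0;
}

// Immediate encoded in VEXTRACT*/VINSERT*: which vecWidth-bit chunk it is.
unsigned subvectorImmediate(unsigned Index, unsigned EltBits, unsigned VecWidth) {
  unsigned ElemsPerChunk = VecWidth / EltBits;
  return Index / ElemsPerChunk;
}

int main() {
  // Elements 8..15 of a v16f32 form the second 256-bit half: aligned, imm 1.
  assert(isAlignedSubvectorIndex(8, 32, 256));
  assert(subvectorImmediate(8, 32, 256) == 1);
  // Element 3 of a v8i32 does not start a 128-bit chunk.
  assert(!isAlignedSubvectorIndex(3, 32, 128));
  return 0;
}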
-unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERTF128"); + llvm_unreachable("Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4208,10 +4417,38 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { MVT VecVT = N->getValueType(0).getSimpleVT(); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } +/// getExtractVEXTRACT128Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 +/// and VINSERTI128 instructions. +unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 128); +} + +/// getExtractVEXTRACT256Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 +/// and VINSERTI64x4 instructions. +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 256); +} + +/// getInsertVINSERT128Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 +/// and VINSERTI128 instructions. +unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 128); +} + +/// getInsertVINSERT256Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 +/// and VINSERTI64x4 instructions. +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 256); +} + /// getShuffleCLImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. /// Handles 256-bit. @@ -4261,7 +4498,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, } MaskVec.push_back(Idx); } - return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), + return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); } @@ -4394,7 +4631,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted @@ -4412,13 +4649,15 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, + array_lengthof(Ops)); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, + array_lengthof(Ops)); } } else llvm_unreachable("Unexpected vector type"); @@ -4431,7 +4670,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); @@ -4439,7 +4678,8 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, + array_lengthof(Ops)); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); @@ -4464,7 +4704,7 @@ static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. -static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4475,7 +4715,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, } /// getUnpackl - Returns a vector_shuffle node for an unpackl operation. -static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4487,7 +4727,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, } /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 
-static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4505,7 +4745,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { EVT VT = V.getValueType(); int NumElems = VT.getVectorNumElements(); - DebugLoc dl = V.getDebugLoc(); + SDLoc dl(V); while (NumElems > 4) { if (EltNo < NumElems/2) { @@ -4522,7 +4762,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { /// getLegalSplat - Generate a legal splat with supported x86 shuffles static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { EVT VT = V.getValueType(); - DebugLoc dl = V.getDebugLoc(); + SDLoc dl(V); if (VT.is128BitVector()) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); @@ -4549,7 +4789,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { EVT SrcVT = SV->getValueType(0); SDValue V1 = SV->getOperand(0); - DebugLoc dl = SV->getDebugLoc(); + SDLoc dl(SV); int EltNo = SV->getSplatIndex(); int NumElems = SrcVT.getVectorNumElements(); @@ -4594,13 +4834,13 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, SelectionDAG &DAG) { EVT VT = V2.getValueType(); SDValue V1 = IsZero - ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 16> MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec.push_back(i == Idx ? NumElems : i); - return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the @@ -4751,19 +4991,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. -static -unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, - bool ZerosFromLeft, SelectionDAG &DAG) { - unsigned i; - for (i = 0; i != NumElems; ++i) { - unsigned Index = ZerosFromLeft ? i : NumElems-i-1; +/// We count undefs as zeros until PreferredNum is reached. +static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, + unsigned NumElems, bool ZerosFromLeft, + SelectionDAG &DAG, + unsigned PreferredNum = -1U) { + unsigned NumZeros = 0; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); - if (!(Elt.getNode() && - (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) + if (!Elt.getNode()) + break; + + if (X86::isZeroNode(Elt)) + ++NumZeros; + else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. 
+ NumZeros = std::min(NumZeros + 1, PreferredNum); + else break; } - return i; + return NumZeros; } /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) @@ -4801,8 +5049,9 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - false /* check zeros from right */, DAG); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, false /* check zeros from right */, DAG, + SVOp->getMaskElt(0)); unsigned OpSrc; if (!NumZeros) @@ -4834,8 +5083,9 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - true /* check zeros from left */, DAG); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, true /* check zeros from left */, DAG, + NumElems - SVOp->getMaskElt(NumElems - 1) - 1); unsigned OpSrc; if (!NumZeros) @@ -4888,7 +5138,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (NumNonZero > 8) return SDValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 16; ++i) { @@ -4936,7 +5186,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (NumNonZero > 4) return SDValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 8; ++i) { @@ -4962,7 +5212,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, - const TargetLowering &TLI, DebugLoc dl) { + const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; @@ -4974,7 +5224,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, } SDValue -X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, +X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, SelectionDAG &DAG) const { // Check if the scalar load can be widened into a vector load. And if @@ -5027,7 +5277,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) - Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), + Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; @@ -5038,10 +5288,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - SmallVector<int, 8> Mask; - for (unsigned i = 0; i != NumElems; ++i) - Mask.push_back(EltNo); - + SmallVector<int, 8> Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } @@ -5058,7 +5305,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, /// rather than undef via VZEXT_LOAD, but we do not detect that case today. 
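getNumOfConsecutiveZeros now caps how many undefs it will treat as zeros via PreferredNum, which the two callers derive from the shuffle mask. The same walk on a plain array, with 0 meaning a zero element, -1 meaning undef, and anything else a real element:

#include <algorithm>
#include <cassert>
#include <vector>

unsigned countConsecutiveZeros(const std::vector<int> &Elts, bool FromLeft,
                               unsigned PreferredNum = -1U) {
  unsigned NumZeros = 0;
  unsigned NumElems = (unsigned)Elts.size();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Elt = Elts[FromLeft ? i : NumElems - i - 1];
    if (Elt == 0)
      ++NumZeros;
    else if (Elt == -1)
      NumZeros = std::min(NumZeros + 1, PreferredNum); // undef counts, capped
    else
      break;
  }
  return NumZeros;
}

int main() {
  std::vector<int> Elts = {0, 0, -1, 7};
  assert(countConsecutiveZeros(Elts, /*FromLeft=*/true) == 3);
  assert(countConsecutiveZeros(Elts, true, /*PreferredNum=*/2) == 2);
  return 0;
}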
/// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, - DebugLoc &DL, SelectionDAG &DAG) { + SDLoc &DL, SelectionDAG &DAG) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); @@ -5094,22 +5341,35 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { + SDValue NewLd = SDValue(); if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), LDBase->getAlignment()); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; } if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, + array_lengthof(Ops), MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, @@ -5144,9 +5404,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { return SDValue(); MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; @@ -5202,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. - if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) + + // AVX-512 has register version of the broadcast + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + Ld.getValueType().getSizeInBits() >= 32; + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && + !hasRegVer)) return SDValue(); break; } } - bool Is256 = VT.is256BitVector(); + bool IsGE256 = (VT.getSizeInBits() >= 256); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. 
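LowerVectorBroadcast's Is256 flag becomes IsGE256 so the same size test can cover 256- and 512-bit destinations in the checks that follow: 32-bit scalars can always be broadcast on this path, 64-bit ones only into vectors of at least 256 bits. As a standalone predicate:

#include <cassert>

// The ScalarSize test used repeatedly in LowerVectorBroadcast.
bool broadcastableScalar(unsigned ScalarBits, unsigned VectorBits) {
  bool IsGE256 = VectorBits >= 256;
  return ScalarBits == 32 || (IsGE256 && ScalarBits == 64);
}

int main() {
  assert(broadcastableScalar(64, 256));  // e.g. 64-bit scalar into a 256-bit vector
  assert(!broadcastableScalar(64, 128)); // 64-bit into 128-bit is rejected here
  return 0;
}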
On Sandybridge it is still better to load a constant vector @@ -5218,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5242,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && - (ScalarSize == 32 || (Is256 && ScalarSize == 64))) + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -5271,7 +5536,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; @@ -5337,14 +5602,120 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { return NV; } +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. +SDValue +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && + "Unexpected type in LowerBUILD_VECTORvXi1!"); + + SDLoc dl(Op); + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(1, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + bool AllContants = true; + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() == ISD::UNDEF) + continue; + if (!isa<ConstantSDNode>(In)) { + AllContants = false; + break; + } + if (cast<ConstantSDNode>(In)->getZExtValue()) + Immediate |= (1ULL << idx); + } + + if (AllContants) { + SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, + DAG.getConstant(Immediate, MVT::i16)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, + DAG.getIntPtrConstant(0)); + } + + if (!isSplatVector(Op.getNode())) + llvm_unreachable("Unsupported predicate operation"); + + SDValue In = Op.getOperand(0); + SDValue EFLAGS, X86CC; + if (In.getOpcode() == ISD::SETCC) { + SDValue Op0 = In.getOperand(0); + SDValue Op1 = In.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get(); + bool isFP = Op1.getValueType().isFloatingPoint(); + unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + + assert(X86CCVal != X86::COND_INVALID && "Unsupported 
predicate operation"); + + X86CC = DAG.getConstant(X86CCVal, MVT::i8); + EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + } else if (In.getOpcode() == X86ISD::SETCC) { + X86CC = In.getOperand(0); + EFLAGS = In.getOperand(1); + } else { + // The algorithm: + // Bit1 = In & 0x1 + // if (Bit1 != 0) + // ZF = 0 + // else + // ZF = 1 + // if (ZF == 0) + // res = allOnes ### CMOVNE -1, %res + // else + // res = allZero + MVT InVT = In.getValueType().getSimpleVT(); + SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); + EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); + X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); + } + + if (VT == MVT::v16i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i16); + SDValue Cst0 = DAG.getConstant(0, MVT::i16); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, + Cst0, Cst1, X86CC, EFLAGS); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + + if (VT == MVT::v8i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i32); + SDValue Cst0 = DAG.getConstant(0, MVT::i32); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, + Cst0, Cst1, X86CC, EFLAGS); + CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + llvm_unreachable("Unsupported predicate operation"); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT VT = Op.getValueType().getSimpleVT(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); + // Generate vectors for predicate vectors. + if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd @@ -5398,7 +5769,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of @@ -5507,7 +5878,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); @@ -5542,7 +5913,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); @@ -5672,22 +6043,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT ResVT = Op.getValueType().getSimpleVT(); - assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); + assert((ResVT.is256BitVector() || + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); + if(ResVT.is256BitVector()) + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); - return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); - // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors + // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. return LowerAVXCONCAT_VECTORS(Op, DAG); } @@ -5698,7 +6072,7 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); MVT VT = SVOp->getValueType(0).getSimpleVT(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); @@ -5759,7 +6133,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); SmallVector<int, 8> MaskVals; // Determine if more than 1 of the words in each of the low and high quadwords @@ -6014,7 +6388,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, const X86TargetLowering &TLI) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); ArrayRef<int> MaskVals = SVOp->getMask(); // Promote splats to a larger type which usually leads to more efficient code. 
@@ -6143,7 +6517,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, MVT VT = SVOp->getValueType(0).getSimpleVT(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -6189,7 +6563,7 @@ static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getValueType(0).getSimpleVT(); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; @@ -6227,7 +6601,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, /// static SDValue getVZextMovL(MVT VT, EVT OpVT, SDValue SrcOp, SelectionDAG &DAG, - const X86Subtarget *Subtarget, DebugLoc dl) { + const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { LoadSDNode *LD = NULL; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) @@ -6272,7 +6646,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); MVT EltVT = VT.getVectorElementType(); MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; @@ -6378,7 +6752,7 @@ static SDValue LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); MVT VT = SVOp->getValueType(0).getSimpleVT(); assert(VT.is128BitVector() && "Unsupported vector size"); @@ -6529,7 +6903,7 @@ static bool MayFoldVectorLoad(SDValue V) { } static -SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { +SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Canonizalize to v2f64. @@ -6540,7 +6914,7 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { } static -SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, +SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6559,7 +6933,7 @@ SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, } static -SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { +SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6575,7 +6949,7 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { } static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { +SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6645,7 +7019,7 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { return SDValue(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = VT.getVectorNumElements(); @@ -6706,10 +7080,10 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). 
unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); EVT FullVT = V.getValueType(); - EVT SubVecVT = EVT::getVectorVT(*Context, + EVT SubVecVT = EVT::getVectorVT(*Context, FullVT.getVectorElementType(), FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, DAG.getIntPtrConstant(0)); } V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); @@ -6724,7 +7098,7 @@ SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6783,7 +7157,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -6865,6 +7239,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { TargetMask, DAG); } + if (isPALIGNRMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, + getShufflePALIGNRImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; @@ -6982,11 +7361,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. - if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, - getShufflePALIGNRImmediate(SVOp), - DAG); - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -7094,7 +7468,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) return SDValue(); @@ -7157,6 +7531,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDValue(); @@ -7165,17 +7540,19 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. - if (VecVT.is256BitVector()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); - unsigned NumElems = VecVT.getVectorNumElements(); + if (VecVT.is256BitVector() || VecVT.is512BitVector()) { SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector. 
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + EVT EltVT = VecVT.getVectorElementType(); + + unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); - if (IdxVal >= NumElems/2) - IdxVal -= NumElems/2; + //if (IdxVal >= NumElems/2) + // IdxVal -= NumElems/2; + IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, MVT::i32)); } @@ -7189,7 +7566,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, } MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); @@ -7248,7 +7624,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType().getSimpleVT(); MVT EltVT = VT.getVectorElementType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); @@ -7303,26 +7679,27 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getValueType().getSimpleVT(); MVT EltVT = VT.getVectorElementType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); // If this is a 256-bit vector result, first extract the 128-bit vector, // insert the element into the extracted half and then place it back. - if (VT.is256BitVector()) { + if (VT.is256BitVector() || VT.is512BitVector()) { if (!isa<ConstantSDNode>(N2)) return SDValue(); // Get the desired 128-bit vector half. - unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - bool Upper = IdxVal >= NumElems/2; + unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32)); + DAG.getConstant(IdxIn128, MVT::i32)); // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); @@ -7348,16 +7725,17 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT OpVT = Op.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. + unsigned SizeFactor = OpVT.getSizeInBits()/128; EVT VT128 = EVT::getVectorVT(*Context, OpVT.getVectorElementType(), - OpVT.getVectorNumElements() / 2); + OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); @@ -7380,16 +7758,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // upper bits of a vector. 
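The index rewrite above generalizes the old 256-bit-only "subtract half the elements" adjustment: after Extract128BitVector pulls out the chunk holding the element, the lane index is reduced modulo the number of elements per 128-bit chunk. A scalar sketch of just that arithmetic (the helper name is made up for illustration):

// E.g. lane 13 of a 512-bit v16i32 lives in the fourth 128-bit chunk and
// becomes lane 1 there: 13 - (13/4)*4 == 1.
static unsigned indexWithin128BitChunk(unsigned IdxVal, unsigned EltBits) {
  unsigned ElemsPerChunk = 128 / EltBits;                   // 4 for i32 lanes
  return IdxVal - (IdxVal / ElemsPerChunk) * ElemsPerChunk; // == IdxVal % ElemsPerChunk
}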
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); - SDValue Vec = Op.getNode()->getOperand(0); - SDValue Idx = Op.getNode()->getOperand(1); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + EVT ResVT = Op.getValueType(); + EVT InVT = In.getValueType(); - if (Op.getNode()->getValueType(0).is128BitVector() && - Vec.getNode()->getValueType(0).is256BitVector() && + if (Subtarget->hasFp256()) { + if (ResVT.is128BitVector() && + (InVT.is256BitVector() || InVT.is512BitVector()) && isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Extract128BitVector(Vec, IdxVal, DAG, dl); + return Extract128BitVector(In, IdxVal, DAG, dl); + } + if (ResVT.is256BitVector() && InVT.is512BitVector() && + isa<ConstantSDNode>(Idx)) { + return Extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); @@ -7401,17 +7785,25 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (Subtarget->hasFp256()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); + SDLoc dl(Op.getNode()); SDValue Vec = Op.getNode()->getOperand(0); SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).is256BitVector() && + if ((Op.getNode()->getValueType(0).is256BitVector() || + Op.getNode()->getValueType(0).is512BitVector()) && SubVec.getNode()->getValueType(0).is128BitVector() && isa<ConstantSDNode>(Idx)) { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } + + if (Op.getNode()->getValueType(0).is512BitVector() && + SubVec.getNode()->getValueType(0).is256BitVector() && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + } } return SDValue(); } @@ -7443,13 +7835,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), CP->getAlignment(), CP->getOffset(), OpFlag); - DebugLoc DL = CP->getDebugLoc(); + SDLoc DL(CP); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); } @@ -7475,14 +7867,14 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), OpFlag); - DebugLoc DL = JT->getDebugLoc(); + SDLoc DL(JT); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. 
if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); return Result; @@ -7513,7 +7905,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. @@ -7521,7 +7913,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); } @@ -7542,7 +7934,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { CodeModel::Model M = getTargetMachine().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, OpFlags); @@ -7563,7 +7955,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { } SDValue -X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. @@ -7612,7 +8004,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); - return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); + return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue @@ -7621,7 +8013,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), @@ -7632,10 +8024,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7650,10 +8042,10 @@ static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; - DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better + SDLoc dl(GA); // ? 
function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), PtrVT), InFlag); + SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -7671,7 +8063,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo* MFI = DAG.getMachineFunction() @@ -7685,7 +8077,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); @@ -7710,7 +8102,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), @@ -7749,7 +8141,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } @@ -7803,7 +8195,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); @@ -7813,7 +8205,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Offset); // Lowering the machine isd will make sure everything is in the right @@ -7850,7 +8242,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // thread-localness. 
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or @@ -7908,7 +8300,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); @@ -7945,7 +8337,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -7967,7 +8359,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); @@ -7983,7 +8375,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) @@ -8058,11 +8450,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, #endif */ - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. - const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); @@ -8112,7 +8504,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), MVT::f64); @@ -8160,7 +8552,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); EVT SVT = N0.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || SVT == MVT::v8i8 || SVT == MVT::v8i16) && @@ -8175,7 +8567,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); @@ -8228,13 +8620,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; - SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, - MVT::i64, MMO); + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, + array_lengthof(Ops), MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. 
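The 0x43300000 / 0x45300000 words in CV0 above are the exponent halves of 2^52 and 2^84, which LowerUINT_TO_FP_i64 uses for the classic bias trick that converts a u64 through two exact doubles. A scalar sketch of that trick, written in plain C++ rather than DAG nodes (a model of the idea, not the lowering code itself):

#include <cmath>
#include <cstdint>
#include <cstring>

// Build 2^52 + lo and 2^84 + hi*2^32 exactly by splicing the 32-bit halves
// into the mantissas, subtract the combined bias, and let the final add do
// the single rounding step.
static double u64ToDouble(uint64_t X) {
  uint64_t LoBits = (X & 0xffffffffULL) | 0x4330000000000000ULL; // 2^52 + lo
  uint64_t HiBits = (X >> 32)           | 0x4530000000000000ULL; // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  const double Bias = std::ldexp(1.0, 84) + std::ldexp(1.0, 52); // exactly representable
  return (Hi - Bias) + Lo;
}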
- SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), + SDValue SignSet = DAG.getSetCC(dl, + getSetCCResultType(*DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, MVT::i64), ISD::SETLT); @@ -8263,7 +8656,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, std::pair<SDValue,SDValue> X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT DstTy = Op.getValueType(); @@ -8321,8 +8714,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, - DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, + array_lengthof(Ops), DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -8336,7 +8729,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); + Ops, array_lengthof(Ops), DstTy, + MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -8348,8 +8742,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) - : DAG.getMergeValues(Ops, 2, DL); + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) + : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); return std::make_pair(pair, SDValue()); } } @@ -8359,7 +8753,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); MVT InVT = In.getValueType().getSimpleVT(); - DebugLoc dl = Op->getDebugLoc(); + SDLoc dl(Op); // Optimize vectors in AVX mode: // @@ -8408,7 +8802,7 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, } SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); MVT SVT = In.getValueType().getSimpleVT(); @@ -8440,7 +8834,7 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); MVT SVT = In.getValueType().getSimpleVT(); @@ -8561,8 +8955,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, MVT VT = Op.getValueType().getSimpleVT(); if (VT.isVector()) { if (VT == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT, - DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, + DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op), MVT::v8i32, Op.getOperand(0))); return SDValue(); } @@ -8575,7 +8969,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (StackSlot.getNode()) // Load the result. 
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); @@ -8592,7 +8986,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, if (StackSlot.getNode()) // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); @@ -8601,7 +8995,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); MVT SVT = In.getValueType().getSimpleVT(); @@ -8615,7 +9009,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT VT = Op.getValueType().getSimpleVT(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; @@ -8649,7 +9043,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT VT = Op.getValueType().getSimpleVT(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; @@ -8686,7 +9080,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT VT = Op.getValueType().getSimpleVT(); MVT SrcVT = Op1.getValueType().getSimpleVT(); @@ -8763,7 +9157,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); MVT VT = Op.getValueType().getSimpleVT(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). @@ -8785,7 +9179,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, return SDValue(); SDNode *N = Op.getNode(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, unsigned> VecInMap; @@ -8797,7 +9191,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { - SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); @@ -8869,7 +9263,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // CF and OF aren't always set the way we want. Determine which // of these we need. 
@@ -9084,7 +9478,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (C->getAPIntValue() == 0) return EmitTest(Op0, X86CC, DAG); - DebugLoc dl = Op0.getDebugLoc(); + SDLoc dl(Op0); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Use SUB instead of CMP to enable CSE between SUB and CMP. @@ -9111,7 +9505,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) - DebugLoc dl = Cmp.getDebugLoc(); + SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, @@ -9128,7 +9522,7 @@ static bool isAllOnes(SDValue V) { /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -9173,14 +9567,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } if (LHS.getNode()) { - // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip - // the condition code later. - bool Invert = false; - if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) { - Invert = true; - LHS = LHS.getOperand(0); - } - // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because @@ -9197,9 +9583,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; - // Flip the condition if the LHS was a not instruction - if (Invert) - Cond = X86::GetOppositeBranchCondition(Cond); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, MVT::i8), BT); } @@ -9207,6 +9590,51 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } +/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point +/// mask CMPs. 
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + return SSECC; +} + // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { @@ -9216,7 +9644,7 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors @@ -9246,7 +9674,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getValueType().getSimpleVT(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (isFP) { #ifndef NDEBUG @@ -9254,43 +9682,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif - unsigned SSECC; - bool Swap = false; - - // SSE Condition code mapping: - // 0 - EQ - // 1 - LT - // 2 - LE - // 3 - UNORD - // 4 - NEQ - // 5 - NLT - // 6 - NLE - // 7 - ORD - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETOEQ: - case ISD::SETEQ: SSECC = 0; break; - case ISD::SETOGT: - case ISD::SETGT: Swap = true; // Fallthrough - case ISD::SETLT: - case ISD::SETOLT: SSECC = 1; break; - case ISD::SETOGE: - case ISD::SETGE: Swap = true; // Fallthrough - case ISD::SETLE: - case ISD::SETOLE: SSECC = 2; break; - case ISD::SETUO: SSECC = 3; break; - case ISD::SETUNE: - case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; // Fallthrough - case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; // Fallthrough - case ISD::SETUGT: SSECC = 6; break; - case ISD::SETO: SSECC = 7; break; - case ISD::SETUEQ: - case ISD::SETONE: SSECC = 8; break; - } - if (Swap) - std::swap(Op0, Op1); + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { @@ -9322,8 +9714,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. 
unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false; - + bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; + switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; @@ -9337,20 +9729,77 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, case ISD::SETUGE: Swap = true; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } + + // Special case: Use min/max operations for SETULE/SETUGE + MVT VET = VT.getVectorElementType(); + bool hasMinMax = + (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget->hasSSE2() && (VET == MVT::i8)); + + if (hasMinMax) { + switch (SetCCOpcode) { + default: break; + case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; + } + + if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + } + if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { - if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) - return SDValue(); + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { + assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); + Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); + + // Create masks for only the low parts/high parts of the 64 bit integers. + static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskLo[] = { 0, 0, 2, 2 }; + SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); + SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); + SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); + Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. 
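The SSE2-only path above emulates the missing PCMPGTQ with 32-bit compares, per the comment "(hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))", flipping sign bits so the low-half compare acts unsigned (and the high half too for the fully unsigned predicates). The same identity written out for scalars, as a correctness sketch rather than the DAG code:

#include <cstdint>

// Signed 64-bit greater-than from 32-bit halves: the high halves compare
// signed, the low halves compare unsigned, and the low result only matters
// when the high halves tie.
static bool sgt64FromHalves(int64_t A, int64_t B) {
  int32_t  AHi = (int32_t)(A >> 32), BHi = (int32_t)(B >> 32);
  uint32_t ALo = (uint32_t)A,        BLo = (uint32_t)B;
  return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
}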
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); @@ -9358,7 +9807,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. - const int Mask[] = { 1, 0, 3, 2 }; + static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); @@ -9369,17 +9818,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, } } - // Since SSE has no unsigned integer comparisons, we need to flip the sign + // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { EVT EltVT = VT.getVectorElementType(); - SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), - EltVT); - std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); - SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], - SignBits.size()); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); @@ -9387,6 +9832,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); + + if (MinMax) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); return Result; } @@ -9400,7 +9848,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); // Optimize to BT if possible. @@ -9494,9 +9942,31 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); + EVT VT = Op1.getValueType(); SDValue CC; + // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available. Otherwise fp cmovs get lowered into a less efficient branch + // sequence later on. + if (Cond.getOpcode() == ISD::SETCC && + ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget->hasSSE1() && VT == MVT::f32)) && + VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); + int SSECC = translateX86FSETCC( + cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + + if (SSECC != 8) { + unsigned Opcode = VT == MVT::f32 ? 
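The new MinMax special case above rewrites SETULE/SETUGE with PMINU*/PMAXU* plus PCMPEQ instead of a sign-flipped PCMPGT: x <=u y holds exactly when min(x, y) == x. A one-line scalar statement of that equivalence (the function name is illustrative):

#include <algorithm>
#include <cstdint>

// Mirrors UMIN followed by PCMPEQ(Op0, Result) in the lowering above.
static bool ule32(uint32_t X, uint32_t Y) {
  return std::min(X, Y) == X;
}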
X86ISD::FSETCCss : X86ISD::FSETCCsd; + SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, + DAG.getConstant(SSECC, MVT::i8)); + SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); + SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); + return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); + } + } + if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) @@ -9684,7 +10154,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); MVT InVT = In.getValueType().getSimpleVT(); - DebugLoc dl = Op->getDebugLoc(); + SDLoc dl(Op); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16)) @@ -9757,7 +10227,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue CC; bool Inverted = false; @@ -10027,7 +10497,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Get the inputs. SDValue Chain = Op.getOperand(0); @@ -10072,6 +10542,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), SPTy).getValue(1); @@ -10085,7 +10557,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the @@ -10152,7 +10624,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); @@ -10218,7 +10690,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, @@ -10228,7 +10700,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. 
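The FSETCC/FAND/FANDN/FOR sequence introduced above is the standard branchless select: the compare yields an all-ones or all-zeros lane, then (mask & trueVal) | (~mask & falseVal) picks a side. A scalar model of one lane, assuming an LT compare stands in for the translated SSE immediate (a sketch of the bit pattern, not the DAG code):

#include <cstdint>
#include <cstring>

// One f32 lane of CMPSS + ANDPS/ANDNPS/ORPS.
static float selectLane(float A, float B, float TrueVal, float FalseVal) {
  uint32_t Mask = (A < B) ? 0xffffffffu : 0u;  // CMPSS with immediate 1 (LT)
  uint32_t T, F, R;
  std::memcpy(&T, &TrueVal, sizeof(T));
  std::memcpy(&F, &FalseVal, sizeof(F));
  R = (Mask & T) | (~Mask & F);
  float Out;
  std::memcpy(&Out, &R, sizeof(Out));
  return Out;
}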
-static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); @@ -10272,7 +10744,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. @@ -10822,8 +11294,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { X86CC = X86::COND_E; break; } - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, @@ -10840,8 +11311,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { else Opcode = X86ISD::PCMPESTRI; - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } @@ -10917,33 +11387,52 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - // RDRAND intrinsics. + // RDRAND/RDSEED intrinsics. case Intrinsic::x86_rdrand_16: case Intrinsic::x86_rdrand_32: - case Intrinsic::x86_rdrand_64: { + case Intrinsic::x86_rdrand_64: + case Intrinsic::x86_rdseed_16: + case Intrinsic::x86_rdseed_32: + case Intrinsic::x86_rdseed_64: { + unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || + IntNo == Intrinsic::x86_rdseed_32 || + IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : + X86ISD::RDRAND; // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); - // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise - // return the value from Rand, which is always 0, casted to i32. + // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. + // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, Op->getValueType(1)), DAG.getConstant(X86::COND_B, MVT::i32), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops, 4); + Ops, array_lengthof(Ops)); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } + + // XTEST intrinsics. 
+ case Intrinsic::x86_xtest: { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, MVT::i8), + InTrans); + SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), + Ret, SDValue(InTrans.getNode(), 1)); + } } } @@ -10953,13 +11442,14 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, MFI->setReturnAddressIsTaken(true); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = - DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), @@ -10977,9 +11467,14 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -10990,6 +11485,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -10997,28 +11494,32 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); - - SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, - Subtarget->is64Bit() ? X86::RBP : X86::EBP, - getPointerTy()); - unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
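The RDRAND/RDSEED lowering above surfaces the hardware carry flag as the "isValid" result (CF=1 means a usable value was delivered). For reference, the same contract as seen from C via Intel's intrinsic; this is a usage sketch, not the LLVM intrinsic being lowered, and it needs a compiler/CPU with RDRAND support (e.g. -mrdrnd):

#include <immintrin.h>

// Returns 1 and fills *Out when the hardware delivered a valid random value
// (CF=1), and 0 when the caller should retry (CF=0), i.e. the same 1/0 result
// the CMOV on COND_B materializes in the lowering above.
static int tryRandom32(unsigned int *Out) {
  return _rdrand32_step(Out);
}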
X86::RCX : X86::ECX); + SDLoc dl (Op); - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(RegInfo->getSlotSize())); - StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); + EVT PtrVT = getPointerTy(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || + (FrameReg == X86::EBP && PtrVT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); + unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize())); + StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); - return DAG.getNode(X86ISD::EH_RETURN, dl, - MVT::Other, - Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, + DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); @@ -11026,7 +11527,7 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } @@ -11041,7 +11542,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); @@ -11211,7 +11712,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); EVT VT = Op.getValueType(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); @@ -11224,7 +11725,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, 2, MVT::i16, MMO); + Ops, array_lengthof(Ops), MVT::i16, + MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -11257,7 +11759,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { @@ -11291,7 +11793,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc 
dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { @@ -11315,7 +11817,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); Op = Op.getOperand(0); // Issue a bsf (scan bits forward) which also sets EFLAGS. @@ -11341,7 +11843,7 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); @@ -11377,7 +11879,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. @@ -11393,7 +11895,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, "Should not custom lower when pmuldq is available!"); // Extract the odd parts. - const int UnpackMask[] = { 1, -1, 3, -1 }; + static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); @@ -11407,7 +11909,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. - const int ShufMask[] = { 0, 4, 2, 6 }; + static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } @@ -11453,7 +11955,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { EVT EltTy = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Lower sdiv X, pow2-const. BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); @@ -11461,9 +11963,11 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); APInt SplatValue, SplatUndef; - unsigned MinSplatBits; + unsigned SplatBitSize; bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs) || + EltTy.getSizeInBits() < SplatBitSize) return SDValue(); if ((SplatValue != 0) && @@ -11491,16 +11995,13 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { - +static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - if (!Subtarget->hasSSE2()) - return SDValue(); - // Optimize shl/srl/sra with constant shift amount. if (isSplatVector(Amt.getNode())) { SDValue SclrAmt = Amt->getOperand(0); @@ -11611,6 +12112,224 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } } + // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
+ if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + unsigned RatioInLog2 = Log2_32_Ceil(Ratio); + uint64_t ShiftAmt = 0; + for (unsigned i = 0; i != Ratio; ++i) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); + } + // Check remaining shift amounts. + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + return SDValue(); +} + +static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasInt256() && + ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v8i32 || VT == MVT::v16i16))) { + SDValue BaseShAmt; + EVT EltVT = VT.getVectorElementType(); + + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned i, j; + for (i = 0; i != NumElts; ++i) { + if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) + continue; + break; + } + for (j = i; j != NumElts; ++j) { + SDValue Arg = Amt.getOperand(j); + if (Arg.getOpcode() == ISD::UNDEF) continue; + if (Arg != Amt.getOperand(i)) + break; + } + if (i != NumElts && j == NumElts) + BaseShAmt = Amt.getOperand(i); + } else { + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) + Amt = Amt.getOperand(0); + if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && + cast<ShuffleVectorSDNode>(Amt)->isSplat()) { + SDValue InVec = Amt.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = InVec.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { + unsigned SplatIdx = + cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + if (BaseShAmt.getNode() == 0) + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, + DAG.getIntPtrConstant(0)); + } + } + + if (BaseShAmt.getNode()) { + if (EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = 
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRA: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRL: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); + } + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + std::vector<SDValue> Vals(Ratio); + for (unsigned i = 0; i != Ratio; ++i) + Vals[i] = Amt.getOperand(i); + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + for (unsigned j = 0; j != Ratio; ++j) + if (Vals[j] != Amt.getOperand(i + j)) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); + } + } + + return SDValue(); +} + +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + SDValue V; + + if (!Subtarget->hasSSE2()) + return SDValue(); + + V = LowerScalarImmediateShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + V = LowerScalarVariableShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + // AVX2 has VPSLLV/VPSRAV/VPSRLV. + if (Subtarget->hasInt256()) { + if (Op.getOpcode() == ISD::SRL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) + return Op; + } + // Lower SHL with variable shift amount. 
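// The v4i32 variable-SHL lowering that follows avoids a per-element shift
// instruction by building 2^amt in floating point: the amount is moved
// into the float exponent field (the "<< 23" visible below), biased by
// 1.0f's encoding, bitcast back to float, and the shift becomes a
// multiply. A scalar sketch of the same trick, assuming IEEE-754 floats
// and 0 <= amt <= 31 (the function name is made up for illustration):
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t shlViaFloatExponent(uint32_t x, uint32_t amt) {
  uint32_t bits = (amt << 23) + 0x3f800000u;   // exponent encoding of 2^amt
  float pow2;
  std::memcpy(&pow2, &bits, sizeof(pow2));     // bitcast i32 -> f32
  return x * (uint32_t)pow2;                   // x << amt == x * 2^amt
}

int main() {
  for (uint32_t amt = 0; amt != 32; ++amt)
    if (shlViaFloatExponent(7, amt) != (7u << amt))
      std::printf("mismatch at amt=%u\n", amt);
  std::printf("checked all amounts\n");
  return 0;
}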
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); @@ -11717,7 +12436,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; unsigned Cond = 0; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: @@ -11784,7 +12503,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); EVT VT = Op.getValueType(); @@ -11827,62 +12546,31 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, // fall through case MVT::v4i32: case MVT::v8i16: { - SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, - Op.getOperand(0), ShAmt, DAG); + // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && + Op00.getOpcode() == ISD::VECTOR_SHUFFLE) + Tmp1 = LowerVectorIntExtend(Op00, DAG); + if (Tmp1.getNode()) { + SDValue Tmp1Op0 = Tmp1.getOperand(0); + assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && + "This optimization is invalid without a VZEXT."); + return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. + Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG); return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); } } } -static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); - - // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. - // There isn't any reason to disable it if the target processor supports it. - if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { - SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. 
- Zero, - Chain - }; - SDNode *Res = - DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, - array_lengthof(Ops)); - return SDValue(Res, 0); - } - - unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); - if (!isDev) - return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - - unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); - - // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; - if (!Op1 && !Op2 && !Op3 && Op4) - return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); - - // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; - if (Op1 && !Op2 && !Op3 && !Op4) - return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); - - // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), - // (MFENCE)>; - return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); -} - static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); SynchronizationScope FenceScope = static_cast<SynchronizationScope>( @@ -11908,9 +12596,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, Zero, Chain }; - SDNode *Res = - DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, - array_lengthof(Ops)); + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -11921,7 +12607,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT T = Op.getValueType(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { @@ -11944,7 +12630,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, 5, T, MMO); + Ops, array_lengthof(Ops), T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; @@ -11955,7 +12641,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, @@ -11966,7 +12652,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), rdx.getValue(1) }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); } SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { @@ -11991,7 +12677,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 
SelectionDAG &DAG) const { static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT T = Node->getValueType(0); SDValue negOp = DAG.getNode(ISD::SUB, dl, T, DAG.getConstant(0, T), Node->getOperand(2)); @@ -12007,7 +12693,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); // Convert seq_cst store -> xchg @@ -12050,9 +12736,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { } if (!ExtraOp) - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } @@ -12060,8 +12746,9 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two XMM registers. - DebugLoc dl = Op.getDebugLoc(); + // which returns the values as { float, float } (in XMM0) or + // { double, double } (which is returned in XMM0, XMM1). + SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); @@ -12075,14 +12762,16 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Entry.isZExt = false; Args.push_back(Entry); + bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const char *LibcallName = (ArgVT == MVT::f64) - ? "__sincos_stret" : "__sincosf_stret"; + const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + Type *RetTy = isF64 + ? (Type*)StructType::get(ArgTy, ArgTy, NULL) + : (Type*)VectorType::get(ArgTy, 4); TargetLowering:: CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, false, false, false, false, 0, @@ -12090,7 +12779,18 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { /*doesNotRet=*/false, /*isReturnValueUsed*/true, Callee, Args, DAG, dl); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; + + if (isF64) + // Returned in xmm0 and xmm1. + return CallResult.first; + + // Returned in bits 0:31 and 32:64 xmm0. + SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(0)); + SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(1)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// LowerOperation - Provide custom lowering hooks for some operations. 
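The FSINCOS change above relies on how the runtime returns the pair: for f64 the { double, double } comes back in XMM0/XMM1 and can be used directly, while for f32 the result is packed into a single XMM register, so sin and cos are just lanes 0 and 1 and the lowering extracts them. A small standalone model of the f32 case, in plain C++ rather than SelectionDAG (the struct and helper below are stand-ins, not the real ABI types):

#include <cmath>
#include <cstdio>

struct XMMF32 { float lane[4]; };           // stand-in for the value in XMM0

static XMMF32 sincosf_packed(float x) {     // models the __sincosf_stret result
  XMMF32 r = { { std::sin(x), std::cos(x), 0.0f, 0.0f } };
  return r;
}

int main() {
  XMMF32 v = sincosf_packed(1.0f);
  float SinVal = v.lane[0];                 // EXTRACT_VECTOR_ELT, index 0
  float CosVal = v.lane[1];                 // EXTRACT_VECTOR_ELT, index 1
  std::printf("sin=%f cos=%f\n", SinVal, CosVal);
  return 0;
}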
@@ -12099,7 +12799,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); @@ -12182,7 +12881,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { static void ReplaceATOMIC_LOAD(SDNode *Node, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); // Convert wide load -> cmpxchg8b/cmpxchg16b @@ -12203,7 +12902,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, static void ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG, unsigned NewOp) { - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); @@ -12216,7 +12915,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, cast<MemSDNode>(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); @@ -12228,7 +12927,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: @@ -12296,7 +12995,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, eax.getValue(2)); // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, + array_lengthof(Ops))); Results.push_back(edx.getValue(1)); return; } @@ -12335,7 +13035,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, 3, T, MMO); + Ops, array_lengthof(Ops), T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? 
X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -12411,6 +13111,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FSRL: return "X86ISD::FSRL"; @@ -12534,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -12547,6 +13249,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -12555,6 +13258,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::XTEST: return "X86ISD::XTEST"; } } @@ -12620,6 +13324,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return NumBits1 > NumBits2; } +bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } @@ -12671,6 +13389,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool +X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) + return false; + + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); @@ -14137,12 +14876,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, } else { // __chkstk(MSVCRT): does not update stack pointer. // Clobbers R10, R11 and EFLAGS. - // FIXME: RAX(allocated size) might be reused and not killed. BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) .addExternalSymbol("__chkstk") .addReg(X86::RAX, RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to subtracted from RSP. + // RAX has the offset to be subtracted from RSP. 
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(X86::RAX); @@ -14334,6 +15072,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); + + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -14379,6 +15120,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -14779,7 +15522,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -14820,7 +15563,8 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + array_lengthof(Ops), Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), @@ -14874,7 +15618,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. @@ -14993,7 +15737,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); @@ -15020,7 +15764,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(), + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), N->getValueType(0), InputVector.getNode()->getOperand(0)); @@ -15065,7 +15809,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Ok, we've now decided to do the transformation. - DebugLoc dl = InputVector.getDebugLoc(); + SDLoc dl(InputVector); // Store the value to a temporary stack slot. 
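// The extract-element combine that follows spills the source vector to a
// single stack temporary and then turns each extracted element into a
// scalar load at base + index * sizeof(element), instead of shuffling
// every element out through registers. A plain C++ model of that
// addressing; the buffer stands in for the stack slot and the values are
// made up:
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  int32_t vec[4] = {10, 20, 30, 40};           // vector being extracted from
  unsigned char slot[sizeof(vec)];             // the stack temporary
  std::memcpy(slot, vec, sizeof(vec));         // one store of the whole vector

  for (unsigned idx = 0; idx != 4; ++idx) {
    int32_t elt;
    // scalar load at offset idx * sizeof(element)
    std::memcpy(&elt, slot + idx * sizeof(int32_t), sizeof(elt));
    std::printf("element %u = %d\n", idx, elt);
  }
  return 0;
}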
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); @@ -15176,7 +15920,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); @@ -15444,7 +16188,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; - Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); } @@ -15512,6 +16256,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. + if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::SETCC) { + + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + EVT IntVT = Cond.getValueType(); + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!TValIsAllOnes && !FValIsAllZeros) { + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); + + return DAG.getNode(ISD::BITCAST, DL, VT, Ret); + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits @@ -15572,6 +16361,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue SetCC; const ConstantSDNode* C = 0; bool needOppositeCond = (CC == X86::COND_E); + bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast<ConstantSDNode>(Op1))) SetCC = Op2; @@ -15580,17 +16370,46 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { else // Quit if all operands are not constants. return SDValue(); - if (C->getZExtValue() == 1) + if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; - else if (C->getZExtValue() != 0) + checkAgainstTrue = true; + } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. 
return SDValue(); - // Skip 'zext' node. - if (SetCC.getOpcode() == ISD::ZERO_EXTEND) - SetCC = SetCC.getOperand(0); + bool truncatedToBoolWithAnd = false; + // Skip (zext $x), (trunc $x), or (and $x, 1) node. + while (SetCC.getOpcode() == ISD::ZERO_EXTEND || + SetCC.getOpcode() == ISD::TRUNCATE || + SetCC.getOpcode() == ISD::AND) { + if (SetCC.getOpcode() == ISD::AND) { + int OpIdx = -1; + ConstantSDNode *CS; + if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && + CS->getZExtValue() == 1) + OpIdx = 1; + if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && + CS->getZExtValue() == 1) + OpIdx = 0; + if (OpIdx == -1) + break; + SetCC = SetCC.getOperand(OpIdx); + truncatedToBoolWithAnd = true; + } else + SetCC = SetCC.getOperand(0); + } switch (SetCC.getOpcode()) { + case X86ISD::SETCC_CARRY: + // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to + // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, + // i.e. it's a comparison against true but the result of SETCC_CARRY is not + // truncated to i1 using 'and'. + if (checkAgainstTrue && !truncatedToBoolWithAnd) + break; + assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && + "Invalid use of SETCC_CARRY!"); + // FALL THROUGH case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); @@ -15606,9 +16425,15 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); // Quit if false value is not a constant. if (!FVal) { - // A special case for rdrand, where 0 is set if false cond is found. SDValue Op = SetCC.getOperand(0); - if (Op.getOpcode() != X86ISD::RDRAND) + // Skip 'zext' or 'trunc' node. + if (Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + // A special case for rdrand/rdseed, where 0 is set if false cond is + // found. + if ((Op.getOpcode() != X86ISD::RDRAND && + Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. @@ -15639,7 +16464,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // If the flag operand isn't dead, don't touch this CMOV. if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) @@ -15842,7 +16667,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, } if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -15892,7 +16717,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { APInt ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); if (Mask != 0) - return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, DAG.getConstant(Mask, VT)); } } @@ -15908,136 +16733,61 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // hardware support for this operation. This is better expressed as an ADD // of two values. 
if (N1C && (1 == N1C->getZExtValue())) { - return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } } return SDValue(); } -/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts -/// when possible. -static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +/// \brief Returns a vector of 0s if the node in input is a vector logical +/// shift by a constant amount which is known to be bigger than or equal +/// to the vector element size in bits. +static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SHL) { - SDValue V = PerformSHLCombine(N, DAG); - if (V.getNode()) return V; - } - - // On X86 with SSE2 support, we can transform this to a vector shift if - // all elements are shifted by the same amount. We can't do this in legalize - // because the a constant vector is typically transformed to a constant pool - // so we have no knowledge of the shift amount. - if (!Subtarget->hasSSE2()) - return SDValue(); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && (!Subtarget->hasInt256() || (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); - SDValue ShAmtOp = N->getOperand(1); - EVT EltVT = VT.getVectorElementType(); - DebugLoc DL = N->getDebugLoc(); - SDValue BaseShAmt = SDValue(); - if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = ShAmtOp.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } - // Handle the case where the build_vector is all undef - // FIXME: Should DAG allow this? - if (i == NumElts) - return SDValue(); + SDValue Amt = N->getOperand(1); + SDLoc DL(N); + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + APInt ShiftAmt = C->getAPIntValue(); + unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); - for (; i != NumElts; ++i) { - SDValue Arg = ShAmtOp.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - if (Arg != BaseShAmt) { - return SDValue(); - } + // SSE2/AVX2 logical shifts always return a vector of 0s + // if the shift amount is bigger than or equal to + // the element size. The constant shift amount will be + // encoded as a 8-bit immediate. + if (ShiftAmt.trunc(8).uge(MaxAmount)) + return getZeroVector(VT, Subtarget, DAG, DL); } - } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && - cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { - SDValue InVec = ShAmtOp.getOperand(0); - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = InVec.getValueType().getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = InVec.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } - } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { - unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); - if (C->getZExtValue() == SplatIdx) - BaseShAmt = InVec.getOperand(1); - } - } - if (BaseShAmt.getNode() == 0) { - // Don't create instructions with illegal types after legalize - // types has run. 
- if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && - !DCI.isBeforeLegalize()) - return SDValue(); + } - BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, - DAG.getIntPtrConstant(0)); - } - } else - return SDValue(); + return SDValue(); +} - // The shift amount is an i32. - if (EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); - else if (EltVT.bitsLT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); +/// PerformShiftCombine - Combine shifts. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } - // The shift amount is identical so we can do a vector shift. - SDValue ValOp = N->getOperand(0); - switch (N->getOpcode()) { - default: - llvm_unreachable("Unknown shift opcode!"); - case ISD::SHL: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); - } - case ISD::SRA: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v4i32: - case MVT::v8i16: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); - } - case ISD::SRL: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); - } + if (N->getOpcode() != ISD::SRA) { + // Try to fold this logical shift into a zero vector. + SDValue V = performShiftToAllZeros(N, DAG, Subtarget); + if (V.getNode()) return V; } + + return SDValue(); } // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) @@ -16055,7 +16805,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) @@ -16174,7 +16924,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); - DebugLoc DL = Narrow->getDebugLoc(); + SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) @@ -16246,7 +16996,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Check LHS for neg if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && @@ -16280,7 +17030,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && @@ -16348,17 +17098,23 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // Validate that the Mask operand is a vector sra node. 
// FIXME: what to do for bytes, since there is a psignb/pblendvb, but // there is no psrai.b - if (Mask.getOpcode() != X86ISD::VSRAI) - return SDValue(); - - // Check that the SRA is all signbits. - SDValue SraC = Mask.getOperand(1); - unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned SraAmt = ~0; + if (Mask.getOpcode() == ISD::SRA) { + SDValue Amt = Mask.getOperand(1); + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) + SraAmt = C->getZExtValue(); + } + } else if (Mask.getOpcode() == X86ISD::VSRAI) { + SDValue SraC = Mask.getOperand(1); + SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + } if ((SraAmt + 1) != EltBits) return SDValue(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. @@ -16407,7 +17163,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); @@ -16454,7 +17210,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CMOV. @@ -16504,7 +17260,7 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, // Create BLSMSK instructions by finding X ^ (X-1) SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && isAllOnes(N0.getOperand(1))) @@ -16524,15 +17280,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); - DebugLoc dl = Ld->getDebugLoc(); + SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); + // On Sandybridge unaligned 256bit loads are inefficient. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8; - - // On Sandybridge unaligned 256bit loads are inefficient. 
+ bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; if (RegVT.is256BitVector() && !Subtarget->hasInt256() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); @@ -16552,7 +17307,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), - std::max(Alignment/2U, 1U)); + std::min(16U, Alignment)); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); @@ -16720,16 +17475,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, StoreSDNode *St = cast<StoreSDNode>(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); - DebugLoc dl = St->getDebugLoc(); + SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8; // If we are saving a concatenation of two XMM registers, perform two stores. // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. + unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; if (VT.is256BitVector() && !Subtarget->hasInt256() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); @@ -16749,7 +17504,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), - std::max(Alignment/2U, 1U)); + std::min(16U, Alignment)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } @@ -16883,8 +17638,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); - DebugLoc LdDL = Ld->getDebugLoc(); - DebugLoc StDL = N->getDebugLoc(); + SDLoc LdDL(Ld); + SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. @@ -16977,7 +17732,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; - EVT VT = LHS.getValueType(); + MVT VT = LHS.getValueType().getSimpleVT(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); @@ -17048,23 +17803,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != NumElts; ++i) { - int LIdx = LMask[i], RIdx = RMask[i]; - - // Ignore any UNDEF components. - if (LIdx < 0 || RIdx < 0 || - (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || - (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) - continue; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + int LIdx = LMask[i+l], RIdx = RMask[i+l]; + + // Ignore any UNDEF components. 
+ if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs - unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; - int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + unsigned Src = (i/HalfLaneElts); // each lane is split between srcs + int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. @@ -17083,7 +17839,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) - return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); + return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); return SDValue(); } @@ -17098,7 +17854,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) - return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); + return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); return SDValue(); } @@ -17135,7 +17891,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } - return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } @@ -17152,6 +17908,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { + // FANDN(x, 0.0) -> 0.0 + // FANDN(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + return SDValue(); +} + static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { @@ -17179,12 +17948,12 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == OpVT.getVectorElementType().getSizeInBits()) { - return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } return SDValue(); } -static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) @@ -17193,7 +17962,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode 
*N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right @@ -17204,14 +17973,14 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); - // EXTLOAD has a better solution on AVX2, + // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { - SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } @@ -17240,7 +18009,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. @@ -17285,7 +18054,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -17323,17 +18092,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } return SDValue(); @@ -17342,7 +18111,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
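// MaterializeSETB below leans on the "sbb r, r" idiom: with the carry
// flag set, subtract-with-borrow of a register from itself yields all
// ones, otherwise zero, so AND-ing with 1 recovers setb's 0/1 value while
// the widened all-ones form can be used directly as a mask. A plain C++
// model of those two uses; CF is just a bool here and the helper name is
// made up:
#include <cstdint>
#include <cstdio>

static uint32_t sbbSameReg(bool CF) {          // sbb r, r == CF ? ~0u : 0u
  return CF ? 0xFFFFFFFFu : 0u;
}

int main() {
  for (int cf = 0; cf != 2; ++cf) {
    uint32_t wide = sbbSameReg(cf != 0);
    uint32_t setb = wide & 1u;                       // the 0/1 setb value
    uint32_t pick = (wide & 123u) | (~wide & 456u);  // all-ones value as a mask
    std::printf("CF=%d setb=%u picked=%u\n", cf, setb, pick);
  }
  return 0;
}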
-static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { +static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, MVT::i8, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), @@ -17353,7 +18122,7 @@ static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -17367,7 +18136,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), + SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); @@ -17397,7 +18166,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue EFLAGS = N->getOperand(3); @@ -17422,7 +18191,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); @@ -17457,7 +18226,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, @@ -17476,7 +18245,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, // (sub (sete X, 0), Y) -> sbb 0, Y // (sub (setne X, 0), Y) -> adc -1, Y static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Look through ZExts. SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); @@ -17522,7 +18291,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) - return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); + return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } @@ -17542,10 +18311,10 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, isa<ConstantSDNode>(Op1.getOperand(1))) { APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); - SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, + SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, VT)); - return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, + return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue()+1, VT)); } } @@ -17555,7 +18324,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) - return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } @@ -17572,7 +18341,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, if (In.getOpcode() != X86ISD::VZEXT) return SDValue(); - return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), In.getOperand(0)); } @@ -17606,6 +18375,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -18132,7 +18902,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, getTargetMachine()))) return; - Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } @@ -18147,7 +18917,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass*> X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { @@ -18214,7 +18984,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget->hasSSE1()) break; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: break; // Scalar SSE types. 
case MVT::f32: @@ -18239,6 +19009,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v8f32: case MVT::v4f64: return std::make_pair(0U, &X86::VR256RegClass); + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + return std::make_pair(0U, &X86::VR512RegClass); } break; } @@ -18349,7 +19124,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } else if (Res.second == &X86::FR32RegClass || Res.second == &X86::FR64RegClass || - Res.second == &X86::VR128RegClass) { + Res.second == &X86::VR128RegClass || + Res.second == &X86::VR256RegClass || + Res.second == &X86::FR32XRegClass || + Res.second == &X86::FR64XRegClass || + Res.second == &X86::VR128XRegClass || + Res.second == &X86::VR256XRegClass || + Res.second == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -18363,6 +19144,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.second = &X86::VR128RegClass; else if (X86::VR256RegClass.hasType(VT)) Res.second = &X86::VR256RegClass; + else if (X86::VR512RegClass.hasType(VT)) + Res.second = &X86::VR512RegClass; } return Res; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index da1dad0..2703274 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -53,6 +53,10 @@ namespace llvm { /// to X86::XORPS or X86::XORPD. FXOR, + /// FAND - Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + /// FSRL - Bitwise logical right shift of floating point values. These /// corresponds to X86::PSRLDQ. FSRL, @@ -290,6 +294,10 @@ namespace llvm { // TESTP - Vector packed fp sign bitwise comparisons TESTP, + // OR/AND test for masks + KORTEST, + KTEST, + // Several flavors of instructions with vector shuffle behaviors. PALIGNR, PSHUFD, @@ -313,6 +321,8 @@ namespace llvm { VPERMI, VPERM2X128, VBROADCAST, + // masked broadcast + VBROADCASTM, // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, @@ -356,10 +366,17 @@ namespace llvm { // RDRAND - Get a random integer and indicate whether it is valid in CF. RDRAND, + // RDSEED - Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + // PCMP*STRI PCMPISTRI, PCMPESTRI, + // XTEST - Test if in transactional execution. + XTEST, + // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - // Atomic 64-bit binary operations. @@ -427,25 +444,45 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isVEXTRACTF128Index - Return true if the specified + /// isVEXTRACT128Index - Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. + bool isVEXTRACT128Index(SDNode *N); + + /// isVINSERT128Index - Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128, VINSERTI128 instructions. 
+ bool isVINSERT128Index(SDNode *N); + + /// isVEXTRACT256Index - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is - /// suitable for input to VEXTRACTF128. - bool isVEXTRACTF128Index(SDNode *N); + /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. + bool isVEXTRACT256Index(SDNode *N); - /// isVINSERTF128Index - Return true if the specified + /// isVINSERT256Index - Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is - /// suitable for input to VINSERTF128. - bool isVINSERTF128Index(SDNode *N); + /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. + bool isVINSERT256Index(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate + /// getExtractVEXTRACT128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index - /// with VEXTRACTF128 instructions. - unsigned getExtractVEXTRACTF128Immediate(SDNode *N); + /// with VEXTRACTF128, VEXTRACTI128 instructions. + unsigned getExtractVEXTRACT128Immediate(SDNode *N); - /// getInsertVINSERTF128Immediate - Return the appropriate + /// getInsertVINSERT128Immediate - Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index - /// with VINSERTF128 instructions. - unsigned getInsertVINSERTF128Immediate(SDNode *N); + /// with VINSERTF128, VINSERT128 instructions. + unsigned getInsertVINSERT128Immediate(SDNode *N); + + /// getExtractVEXTRACT256Immediate - Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. + unsigned getExtractVEXTRACT256Immediate(SDNode *N); + + /// getInsertVINSERT256Immediate - Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF64x4, VINSERTI64x4 instructions. + unsigned getInsertVINSERT256Immediate(SDNode *N); /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. @@ -504,7 +541,7 @@ namespace llvm { /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. virtual EVT - getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; @@ -556,7 +593,7 @@ namespace llvm { virtual const char *getTargetNodeName(unsigned Opcode) const; /// getSetCCResultType - Return the value type to use for ISD::SETCC. - virtual EVT getSetCCResultType(EVT VT) const; + virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; /// computeMaskedBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the @@ -603,7 +640,7 @@ namespace llvm { /// error, this returns a register number of 0. std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + MVT VT) const; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. 
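The getRegForInlineAsmConstraint hunks above move the constraint hook from EVT to MVT and teach it about the new 512-bit register classes. A minimal sketch of the resulting shape, condensed for illustration (the fallback call to the TargetLowering base implementation is an assumption and is not part of the hunks shown):

    // Hedged sketch: the hook now receives a simple value type (MVT), so the
    // body can switch on VT.SimpleTy directly instead of VT.getSimpleVT().SimpleTy,
    // and 512-bit vector types can be mapped to the new VR512 register class.
    std::pair<unsigned, const TargetRegisterClass*>
    X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                    MVT VT) const {
      if (Constraint.size() == 1 && Constraint[0] == 'x' && Subtarget->hasSSE1()) {
        switch (VT.SimpleTy) {
        default: break;
        case MVT::v4f32:
        case MVT::v2f64:
          return std::make_pair(0U, &X86::VR128RegClass);   // 128-bit SSE types
        case MVT::v16f32:
        case MVT::v8f64:
          return std::make_pair(0U, &X86::VR512RegClass);   // new AVX-512 types
        }
      }
      // Assumed fallback; the real function handles many more constraints.
      return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
    }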
@@ -627,6 +664,8 @@ namespace llvm { virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const; + /// isZExtFree - Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in @@ -639,11 +678,11 @@ namespace llvm { virtual bool isZExtFree(EVT VT1, EVT VT2) const; virtual bool isZExtFree(SDValue Val, EVT VT2) const; - /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than - /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to - /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd - /// is expanded to mul + add. - virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; /// isNarrowingProfitable - Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow @@ -716,6 +755,9 @@ namespace llvm { SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; + /// \brief Reset the operation actions based on target options. + virtual void resetOperationActions(); + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(MVT VT) const; @@ -724,9 +766,12 @@ namespace llvm { /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - const X86RegisterInfo *RegInfo; const DataLayout *TD; + /// Used to store the TargetOptions so that we don't waste time resetting + /// the operation actions unless we have to. + TargetOptions TO; + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. /// When SSE is available, use it for f32 operations. 
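Many of the surrounding hunks (PerformSETCCCombine and the other combines above, and the LowerCallResult/Lower* declarations below) replace DebugLoc with SDLoc, which is constructed directly from an SDNode or SDValue rather than queried via getDebugLoc(). A minimal sketch of the recurring before/after pattern; CombineSketch is a hypothetical name, not a function from the patch:

    // Old style: DebugLoc DL = N->getDebugLoc();
    // New style: wrap the node (or an SDValue) in an SDLoc and pass that along.
    static SDValue CombineSketch(SDNode *N, SelectionDAG &DAG) {
      SDLoc DL(N);                        // replaces N->getDebugLoc()
      EVT VT = N->getValueType(0);
      SDValue Op0 = N->getOperand(0);
      SDValue Op1 = N->getOperand(1);
      // SDLoc(Op1) likewise replaces Op1.getDebugLoc() for value-based
      // locations, as in the PerformSubCombine hunk above.
      return DAG.getNode(X86ISD::HADD, DL, VT, Op0, Op1);
    }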
@@ -746,16 +791,16 @@ namespace llvm { SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &ArgInfo, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; @@ -777,7 +822,7 @@ namespace llvm { bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, - int FPDiff, DebugLoc dl) const; + int FPDiff, SDLoc dl) const; unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG) const; @@ -786,15 +831,16 @@ namespace llvm { bool isSigned, bool isReplace) const; - SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, + SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; @@ -808,7 +854,9 @@ namespace llvm { SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const; SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; @@ -816,7 +864,7 @@ namespace llvm { SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -851,7 +899,7 @@ namespace llvm { LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue LowerCall(CallLoweringInfo &CLI, @@ -862,7 +910,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, 
const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index bb362f5..ba1aede 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -84,13 +84,16 @@ defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; -def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; +def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", + [(int_x86_mmx_femms)]>; -def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), - "prefetch\t$addr", []>; +def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), + "prefetch\t$addr", + [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>; -def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), - "prefetchw\t$addr", []>; +def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB, + Requires<[HasPrefetchW]>; // "3DNowA" instructions defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td new file mode 100644 index 0000000..8abae14 --- /dev/null +++ b/lib/Target/X86/X86InstrAVX512.td @@ -0,0 +1,715 @@ +// Bitcasts between 512-bit vector types. Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX512] in { + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + + def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 
VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>; + +// Bitcasts between 256-bit vector types. Return the original type since +// no instruction is needed for the conversion + def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 
VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// +// -- 32x8 form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; +} + +// -- 64x4 fp form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in { +def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} +// -- 32x4 integer form -- +let neverHasSideEffects = 1 in { +def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i128mem:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +} + +let neverHasSideEffects = 1 in { +// -- 64x4 form -- +def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), 
+ (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), + (bc_v8i32 (loadv4i64 addr:$src2)), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +// vinsertps - insert f32 to XMM +def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V; +def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +// -- 32x4 form -- +def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), + (ins f256mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, 
$src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +let neverHasSideEffects = 1 in { +// -- 32x4 form -- +def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), + (ins i256mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v4f32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v2f64 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v2i64 (VEXTRACTI32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + + +def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v8f32 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), + (v8i32 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v4f64 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v4i64 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +// A 256-bit subvector extract from the first 512-bit vector position +// is a subregister copy that needs no instruction. 
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; +def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; +def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; +def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; + +// zmm -> xmm +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; + + +// A 128-bit subvector insert to the first 512-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; + +def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX; + +def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, EVEX; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- +multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, + RegisterClass DestRC, + RegisterClass SrcRC, X86MemOperand x86memop> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX; +} +let ExeDomain = 
SSEPackedSingle in { + defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512, + VR128X, f32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +} + +let ExeDomain = SSEPackedDouble in { + defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512, + VR128X, f64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSZrm addr:$src)>; +def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDZrm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, + RegisterClass SrcRC, RegisterClass KRC> { + def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V512; + def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), + (ins KRC:$mask, SrcRC:$src), + !strconcat(OpcodeStr, + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + []>, EVEX, EVEX_V512, EVEX_KZ; +} + +defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; +defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, + VEX_W; + +def : Pat <(v16i32 (X86vzext VK16WM:$mask)), + (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + +def : Pat <(v8i64 (X86vzext VK8WM:$mask)), + (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + +def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), + (VPBROADCASTDrZrr GR32:$src)>; +def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), + (VPBROADCASTQrZrr GR64:$src)>; + +multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, + RegisterClass KRC> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; + def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, + VR128X:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, + EVEX, EVEX_KZ; + def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; + def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, + (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; +} + +defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, + loadi32, VR512, v16i32, v4i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, + loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VT1>; + +def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), + (VBROADCASTSSZrr VR128X:$src)>; +def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), + (VBROADCASTSDZrr VR128X:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), + (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; +def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), + (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + +let Predicates = [HasAVX512] in { +def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), + (EXTRACT_SUBREG + (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + addr:$src)), sub_ymm)>; +} +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + RegisterClass DstRC, RegisterClass KRC, + ValueType OpVT, ValueType SrcVT> { +def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, + VK16, v16i32, v16i1>, EVEX_V512; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, + VK8, v8i64, v8i1>, EVEX_V512, VEX_W; + +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vt, X86MemOperand x86memop> { + let neverHasSideEffects = 1 in { + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + let mayLoad = 1 in + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vt (load addr:$src)))]>; + let mayStore = 1 in + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let neverHasSideEffects = 1 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +let Predicates = [HasAVX512] in { + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + VEX, TB; + defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, TB; +} + +let Predicates = [HasAVX512] in { + // GR16 from/to 16-bit mask + def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; + def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; + + // Store kreg in memory + def : Pat<(store (v16i1 VK16:$src), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + + def : Pat<(store (v8i1 VK8:$src), addr:$dst), + (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>; +} +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
+let Predicates = [HasAVX512] in { + // GR from/to 8-bit mask without native support + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VK8)>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>; +} + +multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; + +def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; + +// With AVX-512, 8-bit mask is promoted to 16-bit mask. +def : Pat<(not VK8:$src), + (COPY_TO_REGCLASS + (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; + +// Mask binary operation +// - KADD, KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX_4V, VEX_L, TB; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; + +let isCommutable = 1 in { + defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>; + defm KAND : avx512_mask_binop_w<0x41, "kand", and>; + let isCommutable = 0 in + defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; + defm KOR : avx512_mask_binop_w<0x45, "kor", or>; + defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>; + defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; +} + +multiclass avx512_mask_binop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK16:$src1, VK16:$src2), + (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>; +} + +defm : avx512_mask_binop_int<"kadd", "KADD">; +defm : avx512_mask_binop_int<"kand", "KAND">; +defm : avx512_mask_binop_int<"kandn", "KANDN">; +defm : avx512_mask_binop_int<"kor", "KOR">; +defm : avx512_mask_binop_int<"kxnor", "KXNOR">; +defm : avx512_mask_binop_int<"kxor", "KXOR">; +// With AVX-512, 8-bit mask is promoted to 16-bit mask. 
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { + let Predicates = [HasAVX512] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; +} + +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<andn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; + +// Mask unpacking +multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, + RegisterClass KRC1, RegisterClass KRC2> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { + defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>, + VEX_4V, VEX_L, OpSize, TB; +} + +defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; + +multiclass avx512_mask_unpck_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK8:$src1, VK8:$src2), + (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>; +} + +defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>; + +// Mask shift +multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, OpSize, TA, VEX_W; +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>; + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm B : avx512_mask_setop<VK8, v8i1, Val>; + defm W : avx512_mask_setop<VK16, v16i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
+let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; +} +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; + +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), + (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f406416..9ce02ba 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -294,7 +294,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 // unsigned division/remainder let hasSideEffects = 1 in { // so that we don't speculatively execute let SchedRW = [WriteIDiv] in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "div{b}\t$src", [], IIC_DIV8_REG>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in @@ -310,7 +310,7 @@ def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), } // SchedRW let mayLoad = 1 in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH "div{b}\t$src", [], IIC_DIV8_MEM>, SchedLoadReg<WriteIDivLd>; @@ -331,7 +331,7 @@ def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), // Signed division/remainder. let SchedRW = [WriteIDiv] in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "idiv{b}\t$src", [], IIC_IDIV8>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in @@ -347,7 +347,7 @@ def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), } // SchedRW let mayLoad = 1 in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH "idiv{b}\t$src", [], IIC_IDIV8>, SchedLoadReg<WriteIDivLd>; @@ -932,7 +932,8 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, Format f, list<dag> pattern> : ITy<0x82, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> { + mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>, + Sched<[WriteALULd, WriteRMW]> { let ImmT = Imm8; // Always 8-bit immediate. } @@ -959,18 +960,26 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.Imm8Operator:$src))]>; -// BinOpAI - Instructions like "add %eax, %eax, imm". +// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS. class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Register areg, string operands> : ITy<opcode, RawFrm, typeinfo, (outs), (ins typeinfo.ImmOperand:$src), - mnemonic, operands, []> { + mnemonic, operands, []>, Sched<[WriteALU]> { let ImmT = typeinfo.ImmEncoding; let Uses = [areg]; - let Defs = [areg]; + let Defs = [areg, EFLAGS]; let hasSideEffects = 0; } +// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// and use EFLAGS. 
+class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Uses = [areg, EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". /// @@ -1029,16 +1038,16 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -1051,7 +1060,7 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { - let Defs = [EFLAGS] in { + let Uses = [EFLAGS], Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { @@ -1100,16 +1109,16 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + } // Uses = [EFLAGS], Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1167,16 +1176,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, 
%rax|RAX, $src}">; - } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1194,12 +1203,10 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, } // Arithmetic. -let Uses = [EFLAGS] in { - defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, - 1, 0>; - defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, - 0, 0>; -} +defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; +defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; let isCompare = 1 in { defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; @@ -1214,44 +1221,46 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), (X86cmp (and_su node:$lhs, node:$rhs), 0)>; -let isCompare = 1, Defs = [EFLAGS] in { - let isCommutable = 1 in { - def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; - def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; - def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; - def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; - } // isCommutable - - def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; - def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; - def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; - def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; - - def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; - def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; - def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; - def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; - - def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; - def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; - def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; - def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; +let isCompare = 1 in { + let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; + def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; + def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; + def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + + // When testing the result of EXTRACT_SUBREG 
sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. + let isPseudo = 1 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; + } // Defs = [EFLAGS] def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|RAX, $src}">; - - // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the - // register class is constrained to GR8_NOREX. - let isPseudo = 1 in - def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), - "", [], IIC_BIN_NONMEM>; -} + "{$src, %rax|rax, $src}">; +} // isCompare //===----------------------------------------------------------------------===// // ANDN Instruction @@ -1293,12 +1302,12 @@ let neverHasSideEffects = 1 in { let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>; } } @@ -1313,6 +1322,7 @@ let Predicates = [HasBMI2] in { // ADCX Instruction // let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + let SchedRW = [WriteALU] in { def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8, OpSize; @@ -1320,12 +1330,13 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; + } // SchedRW - let mayLoad = 1 in { + let mayLoad = 1, SchedRW = [WriteALULd] in { def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize; - + def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; @@ -1336,6 +1347,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { // ADOX Instruction // let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + let SchedRW = [WriteALU] in { def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; @@ -1343,12 +1355,13 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>; + } // SchedRW - let mayLoad = 1 in { + let mayLoad = 1, SchedRW = [WriteALULd] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; - + def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS, REX_W, 
Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index 8f2d0a1..a967a4d 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -16,7 +16,7 @@ // SetCC instructions. multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", - isCommutable = 1 in { + isCommutable = 1, SchedRW = [WriteALU] in { def NAME#16rr : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), @@ -37,7 +37,8 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { IIC_CMOV32_RR>, TB; } - let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteALULd, ReadAfterLd] in { def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), @@ -83,11 +84,11 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { def r : I<opc, MRM0r, (outs GR8:$dst), (ins), !strconcat(Mnemonic, "\t$dst"), [(set GR8:$dst, (X86setcc OpNode, EFLAGS))], - IIC_SET_R>, TB; + IIC_SET_R>, TB, Sched<[WriteALU]>; def m : I<opc, MRM0m, (outs), (ins i8mem:$dst), !strconcat(Mnemonic, "\t$dst"), [(store (X86setcc OpNode, EFLAGS), addr:$dst)], - IIC_SET_M>, TB; + IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>; } // Uses = [EFLAGS] } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 734e598..8969946 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), // The MSVC runtime contains an _ftol2 routine for converting floating-point // to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. No -// other registers (aside from flags) are touched. +// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is +// used as a temporary register. No other registers (aside from flags) are +// touched. // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 // variant is unnecessary. 
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { +let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), "# win32 fptoui", [(X86WinFTOL RFP32:$src)]>, @@ -149,11 +150,12 @@ let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { //===----------------------------------------------------------------------===// // EH Pseudo Instructions // +let SchedRW = [WriteSystem] in { let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR32:$addr)], IIC_RET>; + [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; } @@ -161,7 +163,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR64:$addr)], IIC_RET>; + [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; } @@ -186,6 +188,7 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Requires<[In64BitMode]>; } } +} // SchedRW let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), @@ -214,50 +217,41 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias Instructions //===----------------------------------------------------------------------===// -// Alias instructions that map movr0 to xor. +// Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. // FIXME: Set encoding to pseudo. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1 in { -def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", - [(set GR8:$dst, 0)], IIC_ALU_NONMEM>; - -// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller -// encoding and avoids a partial-register update sometimes, but doing so -// at isel time interferes with rematerialization in the current register -// allocator. For now, this is rewritten when the instruction is lowered -// to an MCInst. -def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins), - "", - [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize; - -// FIXME: Set encoding to pseudo. + isCodeGenOnly = 1 in def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, 0)], IIC_ALU_NONMEM>; + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; + +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { + let AddedComplexity = 20; } -// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a -// smaller encoding, but doing so at isel time interferes with rematerialization -// in the current register allocator. For now, this is rewritten when the -// instruction is lowered to an MCInst. -// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove -// when we have a better way to specify isel priority. -let Defs = [EFLAGS], isCodeGenOnly=1, - AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, 0)], IIC_ALU_NONMEM>; - // Materialize i64 constant where top 32-bits are zero. 
This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1 in -def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), - "", [(set GR64:$dst, i64immZExt32:$src)], - IIC_ALU_NONMEM>; + isCodeGenOnly = 1, neverHasSideEffects = 1 in +def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), + "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant, and for labels in the +// x86-64 small code model. +def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; // Use sbb to materialize carry bit. -let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1 in { +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { // FIXME: These are pseudo ops that should be replaced with Pat<> patterns. // However, Pat<> can't replicate the destination reg into the inputs of the // result. @@ -320,6 +314,7 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), //===----------------------------------------------------------------------===// // String Pseudo Instructions // +let SchedRW = [WriteMicrocoded] in { let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", [(X86rep_movs i8)], IIC_REP_MOVS>, REP, @@ -382,6 +377,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { [(X86rep_stos i64)], IIC_REP_STOS>, REP, Requires<[In64BitMode]>; } +} // SchedRW //===----------------------------------------------------------------------===// // Thread Local Storage Instructions @@ -594,12 +590,13 @@ defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), "or{l}\t{$zero, $dst|$dst, $zero}", - [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK; + [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK, + Sched<[WriteALULd, WriteRMW]>; let hasSideEffects = 1 in def Int_MemBarrier : I<0, Pseudo, (outs), (ins), "#MEMBARRIER", - [(X86MemBarrier)]>; + [(X86MemBarrier)]>, Sched<[WriteLoad]>; // RegOpc corresponds to the mr version of the instruction // ImmOpc corresponds to the mi version of the instruction @@ -607,7 +604,8 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins), // ImmMod corresponds to the instruction format of the mi and mi8 versions multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, Format ImmMod, string mnemonic> { -let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, @@ -694,7 +692,8 @@ defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; // Optimized codegen when the non-memory output is not used. 
multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, string mnemonic> { -let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), !strconcat(mnemonic, "{b}\t$dst"), @@ -728,7 +727,7 @@ let isCodeGenOnly = 1 in { multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag, InstrItinClass itin8, InstrItinClass itin> { -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { let Defs = [AL, EFLAGS], Uses = [AL] in def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), @@ -748,14 +747,15 @@ let isCodeGenOnly = 1 in { } } -let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in { +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW] in { defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem, IIC_CMPX_LOCK_8B>; } let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b] in { + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", X86cas16, i128mem, IIC_CMPX_LOCK_16B>, REX_W; @@ -768,7 +768,8 @@ defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, InstrItinClass itin8, InstrItinClass itin> { - let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), @@ -932,20 +933,6 @@ def : Pat<(i64 (X86Wrapper texternalsym:$dst)), def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; -// In static codegen with small code model, we can get the address of a label -// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of -// the MOV64ri64i32 should accept these. -def : Pat<(i64 (X86Wrapper tconstpool :$dst)), - (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tjumptable :$dst)), - (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), - (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper texternalsym:$dst)), - (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), - (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>; - // In kernel code model, we can get the address of a label // into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of // the MOV64ri32 should accept these. @@ -990,9 +977,6 @@ def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), // This corresponds to add $foo@tpoff, %rax def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; -// This corresponds to mov foo@tpoff(%rbx), %eax -def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))), - (MOV64rm tglobaltlsaddr :$dst)>; // Direct PC relative function call for small code model. 
32-bit displacement @@ -1112,7 +1096,8 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; -def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1126,14 +1111,16 @@ def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; -def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; -def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>; -def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>; // For other extloads, use subregs, since the high contents of the register are // defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; def : Pat<(extloadi64i32 addr:$src), - (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), - sub_32bit)>; + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; // anyext. Define these to do an explicit zero-extend to // avoid partial-register updates. @@ -1145,8 +1132,10 @@ def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; def : Pat<(i32 (anyext GR16:$src)), (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; -def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>; -def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>; +def : Pat<(i64 (anyext GR8 :$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; def : Pat<(i64 (anyext GR32:$src)), (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; @@ -1192,7 +1181,8 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. -let AddedComplexity = 5 in { // Try this before the selecting to OR +// Try this before the selecting to OR. 
+let AddedComplexity = 5, SchedRW = [WriteALU] in { let isConvertibleToThreeAddress = 1, Constraints = "$src1 = $dst", Defs = [EFLAGS] in { @@ -1239,7 +1229,7 @@ def ADD64ri32_DB : I<0, Pseudo, [(set GR64:$dst, (or_is_add GR64:$src1, i64immSExt32:$src2))]>; } -} // AddedComplexity +} // AddedComplexity, SchedRW //===----------------------------------------------------------------------===// @@ -1310,13 +1300,19 @@ def : Pat<(and GR16:$src1, 0xff), // r & (2^32-1) ==> movz def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), - (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; // r & (2^16-1) ==> movz def : Pat<(and GR64:$src, 0xffff), - (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>; + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; // r & (2^8-1) ==> movz def : Pat<(and GR64:$src, 0xff), - (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>; + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; // r & (2^8-1) ==> movz def : Pat<(and GR32:$src1, 0xff), (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index bfe9541..0e69651 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -20,7 +20,7 @@ // The X86retflag return instructions are variadic because we may add ST0 and // ST1 arguments when returning values on the x87 stack. let isTerminator = 1, isReturn = 1, isBarrier = 1, - hasCtrlDep = 1, FPForm = SpecialFP in { + hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in { def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)], IIC_RET>; @@ -46,7 +46,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, } // Unconditional branches. -let isBarrier = 1, isBranch = 1, isTerminator = 1 in { +let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), @@ -58,7 +58,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in { } // Conditional Branches. -let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in { +let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [], IIC_Jcc>; @@ -85,7 +85,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; // jcx/jecx/jrcx instructions. -let isBranch = 1, isTerminator = 1 in { +let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { // These are the 32-bit versions of this instruction for the asmparser. In // 32-bit mode, the address size prefix is jcxz and the unprefixed version is // jecxz. 
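For context on the SUBREG_TO_REG rewrites in the hunks above (the zextload/extload patterns, the anyext patterns, and the and-with-0x00000000FFFFFFFF masks): on x86-64, any instruction that writes a 32-bit GPR implicitly clears bits 63:32 of the containing 64-bit register, so a plain 32-bit move or a MOVZX32* wrapped in SUBREG_TO_REG already produces a correctly zero-extended 64-bit value. The following standalone C++ sketch only shows the kind of source code these patterns end up applying to; the function names are illustrative and the assembly in the comments is just the typical selection, not a guaranteed one.

#include <cstdint>
#include <cstdio>

// On x86-64, writing a 32-bit GPR implicitly clears bits 63:32 of the
// containing 64-bit register.  That guarantee is what allows the patterns
// above to lower i32->i64 zero-extension, "and x, 0xffffffff", and 64-bit
// zero materialization to 32-bit operations wrapped in SUBREG_TO_REG,
// without dedicated 64-bit movzx instructions.
static uint64_t zext32(uint32_t x) {
  // Typically selected as a single 32-bit move:  movl %edi, %eax
  return static_cast<uint64_t>(x);
}

static uint64_t mask32(uint64_t x) {
  // Typically the same movl; no 64-bit AND instruction is required.
  return x & 0xffffffffu;
}

int main() {
  std::printf("%llu %llu\n",
              static_cast<unsigned long long>(zext32(42u)),
              static_cast<unsigned long long>(mask32(0x1234567890abcdefULL)));
  return 0;
}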
@@ -110,36 +110,46 @@ let isBranch = 1, isTerminator = 1 in { // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", - [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>; + [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>, + Sched<[WriteJump]>; def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", - [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode]>; + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, + Requires<[In32BitMode]>, Sched<[WriteJumpLd]>; def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", - [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>; + [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>, + Sched<[WriteJump]>; def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", - [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>; + [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, + Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), (ins i16imm:$off, i16imm:$seg), - "ljmp{w}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>, OpSize; + "ljmp{w}\t{$seg, $off|$off, $seg}", [], + IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>; def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), (ins i32imm:$off, i16imm:$seg), - "ljmp{l}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>; + "ljmp{l}\t{$seg, $off|$off, $seg}", [], + IIC_JMP_FAR_PTR>, Sched<[WriteJump]>; def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), - "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>; + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJump]>; def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), - "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize; + "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize, + Sched<[WriteJumpLd]>; def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), - "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>; + "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJumpLd]>; } // Loop instructions - +let SchedRW = [WriteJump] in { def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>; def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; +} //===----------------------------------------------------------------------===// // Call Instructions... 
@@ -152,27 +162,32 @@ let isCall = 1 in let Uses = [ESP] in { def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, (outs), (ins i32imm_pcrel:$dst), - "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>; + "call{l}\t$dst", [], IIC_CALL_RI>, + Requires<[In32BitMode]>, Sched<[WriteJump]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - Requires<[In32BitMode]>; + Requires<[In32BitMode]>, Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), - "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, - Requires<[In32BitMode]>; + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], + IIC_CALL_MEM>, + Requires<[In32BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), (ins i16imm:$off, i16imm:$seg), "lcall{w}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>, OpSize; + IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>; def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), (ins i32imm:$off, i16imm:$seg), "lcall{l}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>; + IIC_CALL_FAR_PTR>, Sched<[WriteJump]>; def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), - "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize; + "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize, + Sched<[WriteJumpLd]>; def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), - "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>; + "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, + Sched<[WriteJumpLd]>; // callw for 16 bit code for the assembler. let isAsmParserOnly = 1 in @@ -185,7 +200,7 @@ let isCall = 1 in // Tail call stuff. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in let Uses = [ESP] in { def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset), []>; @@ -216,7 +231,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // RSP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. Uses for argument // registers are added manually. -let isCall = 1, Uses = [RSP] in { +let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { // NOTE: this pattern doesn't match "X86call imm", because we do not know // that the offset between an arbitrary immediate and the call will fit in // the 32-bit pcrel field that we have. 
@@ -231,7 +246,7 @@ let isCall = 1, Uses = [RSP] in { def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode]>; + Requires<[In64BitMode,FavorMemIndirectCall]>; def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; @@ -245,13 +260,12 @@ let isCall = 1, isCodeGenOnly = 1 in def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, - Requires<[IsWin64]>; + Requires<[IsWin64]>, Sched<[WriteJump]>; } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1 in - let Uses = [RSP], - usesCustomInserter = 1 in { + isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, + SchedRW = [WriteJump] in { def TCRETURNdi64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset), []>; diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 2eb454d..28954c6 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -42,48 +42,54 @@ let neverHasSideEffects = 1 in { let neverHasSideEffects = 1 in { def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, - TB, OpSize; + TB, OpSize, Sched<[WriteALU]>; let mayLoad = 1 in def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, - TB, OpSize; + TB, OpSize, Sched<[WriteALULd]>; } // neverHasSideEffects = 1 def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB; + [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB; + [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB, + Sched<[WriteALULd]>; def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB; + [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, - TB; + TB, Sched<[WriteALULd]>; let neverHasSideEffects = 1 in { def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, - TB, OpSize; + TB, OpSize, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, - TB, OpSize; + TB, OpSize, Sched<[WriteALULd]>; } // neverHasSideEffects = 1 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB; + [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, + Sched<[WriteALU]>; def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB; + [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB, + Sched<[WriteALULd]>; def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins 
GR16:$src), "movz{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB; + [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB, + Sched<[WriteALU]>; def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movz{wl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>, - TB; + TB, Sched<[WriteALULd]>; // These are the same as the regular MOVZX32rr8 and MOVZX32rm8 // except that they use GR32_NOREX for the output operand register class @@ -92,12 +98,12 @@ let neverHasSideEffects = 1, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [], IIC_MOVZX>, TB; + [], IIC_MOVZX>, TB, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [], IIC_MOVZX>, TB; + [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register @@ -106,68 +112,61 @@ def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, // were generalized, this would require a special register class. def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), "movs{bq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB; + [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), "movs{bq|x}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>, - TB; + TB, Sched<[WriteALULd]>; def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), "movs{wq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB; + [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "movs{wq|x}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>, - TB; + TB, Sched<[WriteALULd]>; def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>; + [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>, + Sched<[WriteALU]>; def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>; + [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>, + Sched<[WriteALULd]>; // movzbq and movzwq encodings for the disassembler def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB; + TB, Sched<[WriteALU]>; def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB; + TB, Sched<[WriteALULd]>; def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB; + TB, Sched<[WriteALU]>; def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB; - -// FIXME: These should be Pat patterns. -let isCodeGenOnly = 1 in { - -// Use movzbl instead of movzbq when the destination is a register; it's -// equivalent due to implicit zero-extending, and it has a smaller encoding. 
-def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), - "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB; -def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), - "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>, - TB; -// Use movzwl instead of movzwq when the destination is a register; it's -// equivalent due to implicit zero-extending, and it has a smaller encoding. -def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB; -def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "", [(set GR64:$dst, (zextloadi64i16 addr:$src))], - IIC_MOVZX>, TB; - -// There's no movzlq instruction, but movl can be used for this purpose, using -// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero -// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit -// zero-extension, however this isn't possible when the 32-bit value is -// defined by a truncate or is copied from something where the high bits aren't -// necessarily all zero. In such cases, we fall back to these explicit zext -// instructions. -def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src), - "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>; -def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), - "", [(set GR64:$dst, (zextloadi64i32 addr:$src))], - IIC_MOVZX>; -} - + TB, Sched<[WriteALULd]>; + +// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a +// 32-bit register. +def : Pat<(i64 (zext GR8:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; +def : Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +def : Pat<(i64 (zext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; +def : Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; + +// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a +// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible +// when the 32-bit value is defined by a truncate or is copied from something +// where the high bits aren't necessarily all zero. In such cases, we fall back +// to these explicit zext instructions. +def : Pat<(i64 (zext GR32:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; +def : Pat<(i64 (zextloadi64i32 addr:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 568726e..7c37888 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm> // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. 
def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; -def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">; def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; -def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">; def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; -def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; -def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">; def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; -def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; -def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">; +def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; @@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovb\t{$op, %st(0)|st(0), $op}">, DA; def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA; def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmove\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmove\t{$op, %st(0)|st(0), $op}">, DA; def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA; + "fcmovu\t{$op, %st(0)|st(0), $op}">, DA; def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovne\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB; } // Predicates = [HasCMov] // Floating point loads & stores. 
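A note on the asm-string syntax touched throughout this patch: inside an instruction's AsmString, a group written as {a|b} carries one alternative per assembler syntax (the first for AT&T, the second for Intel), and an empty alternative renders as nothing, which is how "fsub{|r}p" prints as fsubp in AT&T syntax and fsubrp in Intel syntax; the hunks above only lowercase the Intel-side register spellings (ST(0) -> st(0)). The self-contained C++ sketch below illustrates that convention; renderVariant is a hypothetical helper written for this note, not LLVM's actual asm printer.

#include <iostream>
#include <string>
#include <vector>

// Sketch of the {att|intel} variant convention used in the asm strings
// above: alternatives inside {...} are separated by '|', the printer keeps
// only the alternative for the active syntax (0 = AT&T, 1 = Intel), and a
// missing alternative renders as nothing.  Illustration only.
static std::string renderVariant(const std::string &Tmpl, unsigned Variant) {
  std::string Out;
  for (size_t i = 0; i < Tmpl.size();) {
    if (Tmpl[i] != '{') {
      Out += Tmpl[i++];
      continue;
    }
    std::vector<std::string> Alts(1); // alternatives of one {...|...} group
    for (++i; i < Tmpl.size() && Tmpl[i] != '}'; ++i) {
      if (Tmpl[i] == '|')
        Alts.emplace_back();
      else
        Alts.back() += Tmpl[i];
    }
    ++i; // skip '}'
    if (Variant < Alts.size())
      Out += Alts[Variant];
  }
  return Out;
}

int main() {
  const std::string S = "fsub{|r}p\t{%st(0), $op|$op, st(0)}";
  std::cout << renderVariant(S, 0) << "\n"; // -> fsubp<TAB>%st(0), $op
  std::cout << renderVariant(S, 1) << "\n"; // -> fsubrp<TAB>$op, st(0)
  return 0;
}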
@@ -422,7 +422,7 @@ def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } -let mayLoad = 1 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", IIC_FLD>; def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", @@ -436,7 +436,7 @@ def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", IIC_FILD>; } -let mayStore = 1 in { +let mayStore = 1, SchedRW = [WriteStore] in { def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", IIC_FST>; def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", @@ -481,7 +481,7 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; } // Predicates = [HasSSE3] -let mayStore = 1 in { +let mayStore = 1, SchedRW = [WriteStore] in { def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", IIC_FST>; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", @@ -491,6 +491,7 @@ def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), } // FP Stack manipulation instructions. +let SchedRW = [WriteMove] in { def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>, D9; def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op", @@ -499,6 +500,7 @@ def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>, DD; def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>, D9; +} // Floating point constant loads. let isReMaterializable = 1 in { @@ -516,19 +518,23 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } +let SchedRW = [WriteZero] in { def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9; def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9; - +} // Floating point compares. +let SchedRW = [WriteFAdd] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // SchedRW } // Defs = [FPSW] +let SchedRW = [WriteFAdd] in { // CC = ST(0) cmp ST(i) let Defs = [EFLAGS, FPSW] in { def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, @@ -566,33 +572,38 @@ def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), "fcompi\t$reg", IIC_FCOMI>, DF; } +} // SchedRW // Floating point flag ops. 
+let SchedRW = [WriteALU] in { let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags - (outs), (ins), "fnstsw %ax", + (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF; def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; - +} // SchedRW let mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>; + (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>, + Sched<[WriteLoad]>; // FPU control instructions +let SchedRW = [WriteMicrocoded] in { let Defs = [FPSW] in def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB; def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg), "ffree\t$reg", IIC_FFREE>, DD; - // Clear exceptions let Defs = [FPSW] in def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB; +} // SchedRW // Operandless floating-point instructions for the disassembler. +let SchedRW = [WriteMicrocoded] in { def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9; @@ -627,6 +638,7 @@ def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W, Requires<[In64BitMode]>; +} // SchedRW //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 44e574d..64018b3 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -35,24 +35,27 @@ def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>; def MRM_C9 : Format<38>; -def MRM_E8 : Format<39>; -def MRM_F0 : Format<40>; -def MRM_F8 : Format<41>; -def MRM_F9 : Format<42>; +def MRM_CA : Format<39>; +def MRM_CB : Format<40>; +def MRM_E8 : Format<41>; +def MRM_F0 : Format<42>; def RawFrmImm8 : Format<43>; def RawFrmImm16 : Format<44>; -def MRM_D0 : Format<45>; -def MRM_D1 : Format<46>; -def MRM_D4 : Format<47>; -def MRM_D5 : Format<48>; -def MRM_D8 : Format<49>; -def MRM_D9 : Format<50>; -def MRM_DA : Format<51>; -def MRM_DB : Format<52>; -def MRM_DC : Format<53>; -def MRM_DD : Format<54>; -def MRM_DE : Format<55>; -def MRM_DF : Format<56>; +def MRM_F8 : Format<45>; +def MRM_F9 : Format<46>; +def MRM_D0 : Format<47>; +def MRM_D1 : Format<48>; +def MRM_D4 : Format<49>; +def MRM_D5 : Format<50>; +def MRM_D6 : Format<51>; +def MRM_D8 : Format<52>; +def MRM_D9 : Format<53>; +def MRM_DA : Format<54>; +def MRM_DB : Format<55>; +def MRM_DC : Format<56>; +def MRM_DD : Format<57>; +def MRM_DE : Format<58>; +def MRM_DF : Format<59>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -93,6 +96,20 @@ def SSEPackedSingle : Domain<1>; def SSEPackedDouble : Domain<2>; def SSEPackedInt : Domain<3>; +// Class specifying the vector form of the decompressed +// displacement of 8-bit. 
+class CD8VForm<bits<3> val> { + bits<3> Value = val; +} +def CD8VF : CD8VForm<0>; // v := VL +def CD8VH : CD8VForm<1>; // v := VL/2 +def CD8VQ : CD8VForm<2>; // v := VL/4 +def CD8VO : CD8VForm<3>; // v := VL/8 +def CD8VT1 : CD8VForm<4>; // v := 1 +def CD8VT2 : CD8VForm<5>; // v := 2 +def CD8VT4 : CD8VForm<6>; // v := 4 +def CD8VT8 : CD8VForm<7>; // v := 8 + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } @@ -129,6 +146,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; } class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX : VEX { bit hasEVEXPrefix = 1; } +class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_CD8<int esize, CD8VForm form> { + bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00, + !if(!eq(esize, 16), 0b01, + !if(!eq(esize, 32), 0b10, + !if(!eq(esize, 64), 0b11, ?)))); + bits<3> EVEX_CD8V = form.Value; +} class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { bit hasXOP_Prefix = 1; } @@ -174,6 +204,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // to be encoded in a immediate field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit hasEVEXPrefix = 0; // Does this inst require EVEX form? + bit hasEVEX_K = 0; // Does this inst require masking? + bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? + bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? + bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? + bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size. + bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? 
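The CD8VForm table and the EVEX_CD8<esize, form> fields added above feed the AVX-512 compressed 8-bit displacement (disp8*N). One hedged reading of that table, assuming "v" is the element count of the memory operand and VL is the vector length in elements, is that N = v * (element size in bytes), where the element size corresponds to the esize argument encoded in the 2-bit EVEX_CD8E field. The C++ sketch below only illustrates that reading; the names are made up for this note and the real handling lives in the X86 MC encoder.

#include <cassert>

// Hedged reading of the CD8VForm table above: "v" is taken to be the number
// of elements the memory operand covers, VL the vector length in elements,
// and the disp8 scale N = v * (element size in bytes).
enum CD8VForm { CD8VF, CD8VH, CD8VQ, CD8VO, CD8VT1, CD8VT2, CD8VT4, CD8VT8 };

// N, the multiplier applied to a compressed 8-bit displacement.
static unsigned disp8Scale(unsigned ElemSizeBits, CD8VForm Form,
                           unsigned VLElems) {
  unsigned NumElems = VLElems;
  switch (Form) {
  case CD8VF:  NumElems = VLElems;     break; // v := VL
  case CD8VH:  NumElems = VLElems / 2; break; // v := VL/2
  case CD8VQ:  NumElems = VLElems / 4; break; // v := VL/4
  case CD8VO:  NumElems = VLElems / 8; break; // v := VL/8
  case CD8VT1: NumElems = 1; break;            // v := 1
  case CD8VT2: NumElems = 2; break;            // v := 2
  case CD8VT4: NumElems = 4; break;            // v := 4
  case CD8VT8: NumElems = 8; break;            // v := 8
  }
  return NumElems * (ElemSizeBits / 8);
}

int main() {
  // Full-vector f32 op on a 512-bit register (16 elements): disp8 * 64.
  assert(disp8Scale(32, CD8VF, 16) == 64);
  // Scalar (one-element tuple) f32 op: disp8 * 4.
  assert(disp8Scale(32, CD8VT1, 16) == 4);
  return 0;
}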
@@ -197,9 +234,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{37} = hasVEX_i8ImmReg; let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; - let TSFlags{40} = has3DNow0F0FOpcode; - let TSFlags{41} = hasMemOp4Prefix; - let TSFlags{42} = hasXOP_Prefix; + let TSFlags{40} = hasEVEXPrefix; + let TSFlags{41} = hasEVEX_K; + let TSFlags{42} = hasEVEX_Z; + let TSFlags{43} = hasEVEX_L2; + let TSFlags{44} = hasEVEX_B; + let TSFlags{46-45} = EVEX_CD8E; + let TSFlags{49-47} = EVEX_CD8V; + let TSFlags{50} = has3DNow0F0FOpcode; + let TSFlags{51} = hasMemOp4Prefix; + let TSFlags{52} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -208,47 +252,47 @@ class PseudoI<dag oops, dag iops, list<dag> pattern> } class I<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT, + list<dag> pattern, InstrItinClass itin = NoItinerary, Domain d = GenericDomain> : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT, + list<dag> pattern, InstrItinClass itin = NoItinerary, Domain d = GenericDomain> : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; @@ -257,12 +301,12 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, // FPStack Instruction Templates: // FPI - Floating Point Instruction template. class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, - InstrItinClass itin = IIC_DEFAULT> + InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, [], itin> {} // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, - InstrItinClass itin = IIC_DEFAULT> + InstrItinClass itin = NoItinerary> : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { let FPForm = fp; let Pattern = pattern; @@ -275,27 +319,30 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, // Iseg32 - 16-bit segment selector, 32-bit offset class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } def __xs : XS; +def __xd : XD; // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], + !if(!eq(Prefix, __xd.Prefix), [UseSSE2], + !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -303,7 +350,7 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm, // SIi8 - SSE 1 & 2 scalar instructions class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); @@ -347,28 +394,28 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // PSI - SSE1 instructions with TB prefix. // PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. // VSSI - SSE1 instructions with XS prefix in AVX form. -// VPSI - SSE1 instructions with TB prefix in AVX form. +// VPSI - SSE1 instructions with TB prefix in AVX form, packed single. 
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, Requires<[UseSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, Requires<[UseSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, Requires<[HasAVX]>; class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB, Requires<[HasAVX]>; @@ -378,52 +425,63 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. // S2SI - SSE2 instructions with XS prefix. // SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. -// PDI - SSE2 instructions with TB and OpSize prefixes. +// PDI - SSE2 instructions with TB and OpSize prefixes, packed double domain. // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. -// VSDI - SSE2 instructions with XD prefix in AVX form. -// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. +// VSDI - SSE2 scalar instructions with XD prefix in AVX form. +// VPDI - SSE2 vector instructions with TB and OpSize prefixes in AVX form, +// packed double domain. +// VS2I - SSE2 scalar instructions with TB and OpSize prefixes in AVX form. +// S2I - SSE2 scalar instructions with TB and OpSize prefixes. // MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as // MMX operands. // MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as // MMX operands. 
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>; class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[UseSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[UseSSE2]>; class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, Requires<[HasAVX]>; class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, Requires<[HasAVX]>; class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; +class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB, + OpSize, Requires<[HasAVX]>; +class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, + OpSize, Requires<[UseSSE2]>; class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; // SSE3 Instruction Templates: @@ -433,15 +491,15 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // S3DI - SSE3 instructions with XD prefix. 
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, Requires<[UseSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, Requires<[UseSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[UseSSE3]>; @@ -458,19 +516,19 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, // classes. They need to be enabled even if AVX is enabled. class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[UseSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[UseSSSE3]>; class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasSSSE3]>; class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSSE3]>; @@ -480,11 +538,11 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, // SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. // class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[UseSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[UseSSE41]>; @@ -492,19 +550,19 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[UseSSE42]>; // SS42FI - SSE 4.2 instructions with T8XD prefix. // NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. 
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[UseSSE42]>; @@ -514,11 +572,11 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, // AVX8I - AVX instructions with T8 and OpSize prefix. // AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8. class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, Requires<[HasAVX]>; class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasAVX]>; @@ -528,66 +586,134 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // AVX28I - AVX2 instructions with T8 and OpSize prefix. // AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8. class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, Requires<[HasAVX2]>; class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasAVX2]>; + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8 and OpSize prefix. +// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed. +// AVX512PSI - AVX-512 instructions with TB, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. +// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain. +// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes. 
+ +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX512]>; +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX512]>; +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, + OpSize, Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8, + OpSize, EVEX_4V, Requires<[HasAVX512]>; + // AES Instruction Templates: // // AES8I // These use the same encoding as the SSE4.2 T8 and TA encodings. 
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasAES]>; // PCLMUL Instruction Templates class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasPCLMUL]>; class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>; // FMA3 Instruction Templates class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8, - OpSize, VEX_4V, Requires<[HasFMA]>; + OpSize, VEX_4V, FMASC, Requires<[HasFMA]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, TA, - OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; + OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XOP, XOP9, Requires<[HasXOP]>; // XOP 2, 3 and 4 Operand Instruction Templates with imm byte class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XOP, XOP8, Requires<[HasXOP]>; // XOP 5 operand instruction (VEX encoding!) 
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag>pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; @@ -595,34 +721,47 @@ class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, // class RI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W { let Pattern = pattern; let CodeSize = 3; } +class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W; class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W; class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W; class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W; +class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W; // MMX Instruction templates // @@ -635,23 +774,23 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, // MMXID - MMX instructions with XD prefix. // MMXIS - MMX instructions with XS prefix. 
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>; class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>; class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>; class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 2a72fb6..0b51521 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -47,6 +47,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; @@ -136,6 +138,8 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, @@ -153,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; + def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; @@ -192,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : 
SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; @@ -405,28 +412,54 @@ def BYTE_imm : SDNodeXForm<imm, [{ return getI32Imm(N->getZExtValue() >> 3); }]>; -// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index -// to VEXTRACTF128 imm. -def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. +def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT128Immediate(N)); }]>; -// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to -// VINSERTF128 imm. -def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. +def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N)); }]>; -def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. +def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT256Immediate(N)); +}]>; + +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT128Index(N); +}], EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT128Index(N); +}], INSERT_get_vinsert128_imm>; + + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, node:$index), [{ - return X86::isVEXTRACTF128Index(N); -}], EXTRACT_get_vextractf128_imm>; + return X86::isVEXTRACT256Index(N); +}], EXTRACT_get_vextract256_imm>; -def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), (insert_subvector node:$bigvec, node:$smallvec, node:$index), [{ - return X86::isVINSERTF128Index(N); -}], INSERT_get_vinsertf128_imm>; + return X86::isVINSERT256Index(N); +}], INSERT_get_vinsert256_imm>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 17714ac..0443a93 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -97,7 +97,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) (tm.getSubtarget<X86Subtarget>().is64Bit() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), - TM(tm), RI(tm, *this) { + TM(tm), RI(tm) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -451,9 +451,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 }, - { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 }, - { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 }, { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, @@ -1381,7 +1378,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - case X86::MOVZX64rr8: if (!TM.getSubtarget<X86Subtarget>().is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. @@ -1389,9 +1385,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: - case X86::MOVZX64rr16: - case X86::MOVSX64rr32: - case X86::MOVZX64rr32: { + case X86::MOVSX64rr32: { if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) // Be conservative. return false; @@ -1404,17 +1398,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - case X86::MOVZX64rr8: SubIdx = X86::sub_8bit; break; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: - case X86::MOVZX64rr16: SubIdx = X86::sub_16bit; break; case X86::MOVSX64rr32: - case X86::MOVZX64rr32: SubIdx = X86::sub_32bit; break; } @@ -1722,37 +1713,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - DebugLoc DL = Orig->getDebugLoc(); - - // MOV32r0 etc. are implemented with xor which clobbers condition code. - // Re-materialize them as movri instructions to avoid side effects. - bool Clone = true; + // MOV32r0 is implemented with a xor which clobbers condition code. + // Re-materialize it as movri instructions to avoid side effects. unsigned Opc = Orig->getOpcode(); - switch (Opc) { - default: break; - case X86::MOV8r0: - case X86::MOV16r0: - case X86::MOV32r0: - case X86::MOV64r0: { - if (!isSafeToClobberEFLAGS(MBB, I)) { - switch (Opc) { - default: llvm_unreachable("Unreachable!"); - case X86::MOV8r0: Opc = X86::MOV8ri; break; - case X86::MOV16r0: Opc = X86::MOV16ri; break; - case X86::MOV32r0: Opc = X86::MOV32ri; break; - case X86::MOV64r0: Opc = X86::MOV64ri64i32; break; - } - Clone = false; - } - break; - } - } - - if (Clone) { + if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + DebugLoc DL = Orig->getDebugLoc(); + BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) + .addImm(0); + } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); - } else { - BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0); } MachineInstr *NewMI = prior(I); @@ -1772,6 +1742,98 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) { return false; } +/// getTruncatedShiftCount - check whether the shift count for a machine operand +/// is non-zero. +inline static unsigned getTruncatedShiftCount(MachineInstr *MI, + unsigned ShiftAmtOperandIdx) { + // The shift count is six bits with the REX.W prefix and five bits without. 
+ unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; + unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); + return Imm & ShiftCountMask; +} + +/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate +/// can be represented by a LEA instruction. +inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { + // Left shift instructions can be transformed into load-effective-address + // instructions if we can encode them appropriately. + // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // The SIB.scale field is two bits wide which means that we can encode any + // shift amount less than 4. + return ShAmt < 4 && ShAmt > 0; +} + +bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned Opc, bool AllowSP, + unsigned &NewSrc, bool &isKill, bool &isUndef, + MachineOperand &ImplicitOp) const { + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC; + if (AllowSP) { + RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; + } else { + RC = Opc != X86::LEA32r ? + &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; + } + unsigned SrcReg = Src.getReg(); + + // For both LEA64 and LEA32 the register already has essentially the right + // type (32-bit or 64-bit) we may just need to forbid SP. + if (Opc != X86::LEA64_32r) { + NewSrc = SrcReg; + isKill = Src.isKill(); + isUndef = Src.isUndef(); + + if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + !MF.getRegInfo().constrainRegClass(NewSrc, RC)) + return false; + + return true; + } + + // This is for an LEA64_32r and incoming registers are 32-bit. One way or + // another we need to add 64-bit registers to the final MI. + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + ImplicitOp = Src; + ImplicitOp.setImplicit(); + + NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); + + switch (LQR) { + case MachineBasicBlock::LQR_Unknown: + // We can't give sane liveness flags to the instruction, abandon LEA + // formation. + return false; + case MachineBasicBlock::LQR_Live: + isKill = MI->killsRegister(SrcReg); + isUndef = false; + break; + default: + // The physreg itself is dead, so we have to use it as an <undef>. + isKill = false; + isUndef = true; + break; + } + } else { + // Virtual register of the wrong class, we have to create a temporary 64-bit + // vreg to feed into the LEA. + NewSrc = MF.getRegInfo().createVirtualRegister(RC); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + .addOperand(Src); + + // Which is obviously going to be dead after we're done with it. + isKill = true; + isUndef = false; + } + + // We've set all the parameters without issue. + return true; +} + /// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when /// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting /// to a 32-bit superregister and then truncating back down to a 16-bit @@ -1787,11 +1849,16 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, bool isDead = MI->getOperand(0).isDead(); bool isKill = MI->getOperand(1).isKill(); - unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() - ? 
X86::LEA64_32r : X86::LEA32r; MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); + unsigned Opc, leaInReg; + if (TM.getSubtarget<X86Subtarget>().is64Bit()) { + Opc = X86::LEA64_32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + } else { + Opc = X86::LEA32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + } // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. @@ -1841,7 +1908,10 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { - leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + else + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); @@ -1891,6 +1961,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { MachineInstr *MI = MBBI; + + // The following opcodes also sets the condition code register(s). Only + // convert them to equivalent lea if the condition code register def's + // are dead! + if (hasLiveCondCodeDef(MI)) + return 0; + MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. const MachineOperand &Dest = MI->getOperand(0); @@ -1935,10 +2012,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && @@ -1953,29 +2028,34 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHL32ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; // LEA can't handle ESP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), - &X86::GR32_NOSPRegClass)) + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) - .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + .addReg(0).addImm(1 << ShAmt) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0).addReg(0); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + NewMI = MIB; + break; } case X86::SHL16ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; @@ -1985,11 +2065,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } default: { - // The following opcodes also sets the condition code register(s). Only - // convert them to equivalent lea if the condition code register def's - // are dead! - if (hasLiveCondCodeDef(MI)) - return 0; switch (MIOpc) { default: return 0; @@ -1999,17 +2074,20 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - const TargetRegisterClass *RC = MIOpc == X86::INC64r ? - (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : - (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; - - // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), 1); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, 1); break; } case X86::INC16r: @@ -2026,16 +2104,22 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - const TargetRegisterClass *RC = MIOpc == X86::DEC64r ? - (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : - (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; - // LEA can't handle RSP. 
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), -1); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, -1); + break; } case X86::DEC16r: @@ -2052,36 +2136,41 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32rr_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc; - const TargetRegisterClass *RC; - if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) Opc = X86::LEA64r; - RC = &X86::GR64_NOSPRegClass; - } else { + else Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - RC = &X86::GR32_NOSPRegClass; - } - - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return 0; - // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src2) && - !MF.getRegInfo().constrainRegClass(Src2, RC)) + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) return 0; - NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + MIB.addOperand(ImplicitOp2); + + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); // Preserve undefness of the operands. - bool isUndef = MI->getOperand(1).isUndef(); - bool isUndef2 = MI->getOperand(2).isUndef(); NewMI->getOperand(1).setIsUndef(isUndef); NewMI->getOperand(3).setIsUndef(isUndef2); - if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, NewMI); + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); break; } case X86::ADD16rr: @@ -2120,9 +2209,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri8_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return 0; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); break; } case X86::ADD16ri: @@ -3171,6 +3272,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, inline static bool isDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return false; + + // The shift instructions only modify ZF if their shift count is non-zero. + // N.B.: The processor truncates the shift count depending on the encoding. + case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: + case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: + return getTruncatedShiftCount(MI, 2) != 0; + + // Some left shift instructions can be turned into LEA instructions but only + // if their flags aren't used. Avoid transforming such instructions. + case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (isTruncatedShiftCountForLEA(ShAmt)) return false; + return ShAmt != 0; + } + + case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: + case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: + return getTruncatedShiftCount(MI, 3) != 0; + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: @@ -3200,8 +3320,37 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: + case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: + case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: + case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: + case X86::ADC32ri: case X86::ADC32ri8: + case X86::ADC32rr: case X86::ADC64ri32: + case X86::ADC64ri8: case X86::ADC64rr: + case X86::SBB32ri: case X86::SBB32ri8: + case X86::SBB32rr: case X86::SBB64ri32: + case X86::SBB64ri8: case X86::SBB64rr: case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: case X86::ANDN64rm: + case X86::BEXTR32rr: case X86::BEXTR64rr: + case X86::BEXTR32rm: case X86::BEXTR64rm: + case X86::BLSI32rr: case X86::BLSI32rm: + case X86::BLSI64rr: case X86::BLSI64rm: + case X86::BLSMSK32rr:case X86::BLSMSK32rm: + case X86::BLSMSK64rr:case X86::BLSMSK64rm: + case X86::BLSR32rr: case X86::BLSR32rm: + case X86::BLSR64rr: case X86::BLSR64rm: + case X86::BZHI32rr: case X86::BZHI32rm: + case X86::BZHI64rr: case X86::BZHI64rm: + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::TZCNT16rr: 
case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: return true; } } @@ -3308,10 +3457,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. - if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 || - Instr->getOpcode() == X86::MOV16r0 || - Instr->getOpcode() == X86::MOV32r0 || - Instr->getOpcode() == X86::MOV64r0) && + if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && Instr->registerDefIsDead(X86::EFLAGS, TRI)) { Movr0Inst = Instr; continue; @@ -3420,20 +3566,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // The instruction to be updated is either Sub or MI. Sub = IsCmpZero ? MI : Sub; - // Move Movr0Inst to the place right before Sub. + // Move Movr0Inst to the appropriate place before Sub. if (Movr0Inst) { - Sub->getParent()->remove(Movr0Inst); - Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst); + // Look backwards until we find a def that doesn't use the current EFLAGS. + Def = Sub; + MachineBasicBlock::reverse_iterator + InsertI = MachineBasicBlock::reverse_iterator(++Def), + InsertE = Sub->getParent()->rend(); + for (; InsertI != InsertE; ++InsertI) { + MachineInstr *Instr = &*InsertI; + if (!Instr->readsRegister(X86::EFLAGS, TRI) && + Instr->modifiesRegister(X86::EFLAGS, TRI)) { + Sub->getParent()->remove(Movr0Inst); + Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), + Movr0Inst); + break; + } + } + if (InsertI == InsertE) + return false; } // Make sure Sub instruction defines EFLAGS and mark the def live. - unsigned LastOperand = Sub->getNumOperands() - 1; - assert(Sub->getNumOperands() >= 2 && - Sub->getOperand(LastOperand).isReg() && - Sub->getOperand(LastOperand).getReg() == X86::EFLAGS && - "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND"); - Sub->getOperand(LastOperand).setIsDef(true); - Sub->getOperand(LastOperand).setIsDead(false); + unsigned i = 0, e = Sub->getNumOperands(); + for (; i != e; ++i) { + MachineOperand &MO = Sub->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + MO.setIsDead(false); + break; + } + } + assert(i != e && "Unable to locate a def EFLAGS operand"); + CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. 
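The comments in the shift-count hunks above rest on two encoding facts: an immediate shift count is truncated by the processor to six bits under REX.W and five bits otherwise, and LEA's SIB.scale field is only two bits wide, so only shift amounts 1 through 3 can be folded into an LEA. Below is a minimal standalone C++ sketch of that arithmetic; the helper names and the free-standing form are illustrative only, not the in-tree LLVM functions.

#include <cassert>
#include <cstdint>

// Hardware masks the immediate shift count: 6 bits with REX.W, 5 bits without.
static unsigned truncatedShiftCount(uint64_t Imm, bool HasREXW) {
  unsigned Mask = HasREXW ? 63 : 31;
  return static_cast<unsigned>(Imm) & Mask;
}

// SIB.scale is two bits (factors 1/2/4/8, i.e. shifts 0..3); a shift of 0 is
// not worth rewriting, so only counts 1..3 are candidates for an LEA.
static bool fitsLEAScale(unsigned ShAmt) {
  return ShAmt > 0 && ShAmt < 4;
}

int main() {
  assert(truncatedShiftCount(32, /*HasREXW=*/false) == 0); // 32-bit shift by 32 truncates to 0
  assert(truncatedShiftCount(32, /*HasREXW=*/true) == 32);
  assert(fitsLEAScale(3) && !fitsLEAScale(4));
  return 0;
}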
@@ -3569,19 +3733,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -MachineInstr* -X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - X86AddressMode AM; - AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = FrameIx; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE)); - addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr); - return &*MIB; -} - static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, const SmallVectorImpl<MachineOperand> &MOs, MachineInstr *MI, @@ -3655,7 +3806,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Align) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect(); bool isTwoAddrFold = false; + + // Atom favors register form of call. So, we do not fold loads into calls + // when X86Subtarget is Atom. + if (isCallRegIndirect && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { + return NULL; + } + unsigned NumOps = MI->getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; @@ -3677,18 +3837,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 - unsigned Opc = 0; - switch (MI->getOpcode()) { - default: break; - case X86::MOV64r0: Opc = X86::MOV64mi32; break; - case X86::MOV32r0: Opc = X86::MOV32mi; break; - case X86::MOV16r0: Opc = X86::MOV16mi; break; - case X86::MOV8r0: Opc = X86::MOV8mi; break; + if (MI->getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); + if (NewMI) + return NewMI; } - if (Opc) - NewMI = MakeM0Inst(*this, Opc, MOs, MI); - if (NewMI) - return NewMI; OpcodeTablePtr = &RegOp2MemOpTable0; } else if (i == 1) { @@ -4074,13 +4227,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 - switch (Opc) { - case X86::MOV8r0: - case X86::MOV16r0: - case X86::MOV32r0: - case X86::MOV64r0: return true; - default: break; - } + if (Opc == X86::MOV32r0) + return true; + OpcodeTablePtr = &RegOp2MemOpTable0; } else if (OpNum == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; @@ -4241,7 +4390,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; std::vector<SDValue> AfterOps; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); unsigned NumOps = N->getNumOperands(); for (unsigned i = 0; i != NumOps-1; ++i) { SDValue Op = N->getOperand(i); @@ -4272,7 +4421,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, - VT, MVT::Other, &AddrOps[0], AddrOps.size()); + VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. 
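The unfoldMemoryOperand hunk above replaces the explicit pointer/length arguments (&AddrOps[0], AddrOps.size()) with the vector itself, which is consistent with the node-creation APIs now taking their operand lists as a lightweight ArrayRef-style view that converts implicitly from std::vector. The sketch below illustrates that calling convention with a hand-rolled stand-in for the view type rather than the real llvm::ArrayRef.

#include <cstddef>
#include <vector>

// Minimal stand-in for an ArrayRef-like view: a non-owning pointer plus
// length, implicitly constructible from std::vector.
template <typename T> class View {
  const T *Data = nullptr;
  std::size_t Length = 0;
public:
  View(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
  const T *data() const { return Data; }
  std::size_t size() const { return Length; }
};

// The callee takes the view, so callers no longer spell out &Ops[0], Ops.size().
static std::size_t countOperands(View<int> Ops) { return Ops.size(); }

int main() {
  std::vector<int> AddrOps = {1, 2, 3};
  return countOperands(AddrOps) == 3 ? 0 : 1; // implicit conversion, as in the diff
}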
@@ -4294,8 +4443,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, if (Load) BeforeOps.push_back(SDValue(Load, 0)); std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); - SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0], - BeforeOps.size()); + SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); NewNodes.push_back(NewNode); // Emit the store instruction. @@ -4317,8 +4465,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, TM), - dl, MVT::Other, - &AddrOps[0], AddrOps.size()); + dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. @@ -4500,6 +4647,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. + if (!TM.getSubtarget<X86Subtarget>().hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + switch(Second->getOpcode()) { + default: + return false; + case X86::JE_4: + case X86::JNE_4: + case X86::JL_4: + case X86::JLE_4: + case X86::JG_4: + case X86::JGE_4: + FuseKind = FuseInc; + break; + case X86::JB_4: + case X86::JBE_4: + case X86::JA_4: + case X86::JAE_4: + FuseKind = FuseCmp; + break; + case X86::JS_4: + case X86::JNS_4: + case X86::JP_4: + case X86::JNP_4: + case X86::JO_4: + case X86::JNO_4: + FuseKind = FuseTest; + break; + } + switch (First->getOpcode()) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case 
X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64_16r: + case X86::INC64_32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64_16r: + case X86::DEC64_32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + } +} bool X86InstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 260f054..a0d1ba7 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -192,6 +192,19 @@ public: const MachineInstr *Orig, const TargetRegisterInfo &TRI) const; + /// Given an operand within a MachineInstr, insert preceding code to put it + /// into the right format for a particular kind of LEA instruction. This may + /// involve using an appropriate super-register instead (with an implicit use + /// of the original) or creating a new virtual register and inserting COPY + /// instructions to get the data into the right class. + /// + /// Reference parameters are set to indicate how caller should add this + /// operand to the LEA instruction. + bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned LEAOpcode, bool AllowSP, + unsigned &NewSrc, bool &isKill, + bool &isUndef, MachineOperand &ImplicitOp) const; + /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target /// may be able to convert a two-address instruction into a true @@ -262,12 +275,6 @@ public: virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; - virtual - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const; - /// foldMemoryOperand - If this target supports it, fold a load or store of /// the specified stack slot into the specified machine instruction for the /// specified operand(s). 
If this is possible, the target should perform the @@ -332,6 +339,9 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const; + virtual bool shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const LLVM_OVERRIDE; + virtual void getNoopForMachoTarget(MCInst &NopInst) const; virtual diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index d989ec7..0960a2a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -142,6 +142,9 @@ def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -314,6 +317,16 @@ def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; } +def X86MemVZ64Operand : AsmOperandClass { + let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64"; +} +def X86MemVZ32Operand : AsmOperandClass { + let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32"; +} +def X86Mem512AsmOperand : AsmOperandClass { + let Name = "Mem512"; let PredicateMethod = "isMem512"; +} + def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; @@ -342,6 +355,8 @@ def i128mem : X86MemOperand<"printi128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def i256mem : X86MemOperand<"printi256mem"> { let ParserMatchClass = X86Mem256AsmOperand; } +def i512mem : X86MemOperand<"printi512mem"> { + let ParserMatchClass = X86Mem512AsmOperand; } def f32mem : X86MemOperand<"printf32mem"> { let ParserMatchClass = X86Mem32AsmOperand; } def f64mem : X86MemOperand<"printf64mem"> { @@ -352,6 +367,12 @@ def f128mem : X86MemOperand<"printf128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def f256mem : X86MemOperand<"printf256mem">{ let ParserMatchClass = X86Mem256AsmOperand; } +def f512mem : X86MemOperand<"printf512mem">{ + let ParserMatchClass = X86Mem512AsmOperand; } +def v512mem : Operand<iPTR> { + let PrintMethod = "printf512mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86Mem512AsmOperand; } // Gather mem operands def vx32mem : X86MemOperand<"printi32mem">{ @@ -366,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{ def vy64mem : X86MemOperand<"printi64mem">{ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); let ParserMatchClass = X86MemVY64Operand; } +def vy64xmem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } +def vz32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ32Operand; } +def vz64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ64Operand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -520,8 +550,7 @@ def i64i8imm : Operand<i64> { def lea64_32mem : Operand<i32> { let PrintMethod = "printi32mem"; - let AsmOperandLowerMethod = "lower_lea64_32mem"; - let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } @@ -543,7 +572,7 @@ def lea32addr : 
ComplexPattern<i32, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. -def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; @@ -588,11 +617,19 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasEMI : Predicate<"Subtarget->hasERI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; +def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; @@ -603,7 +640,12 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasHLE : Predicate<"Subtarget->hasHLE()">; +def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; @@ -626,6 +668,7 @@ def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; +def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. @@ -758,7 +801,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ // // Nop -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, SchedRW = [WriteZero] in { def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero), "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize; @@ -769,8 +812,9 @@ let neverHasSideEffects = 1 in { // Constructing a stack frame. 
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), - "enter\t$len, $lvl", [], IIC_ENTER>; + "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; +let SchedRW = [WriteALU] in { let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, @@ -780,32 +824,33 @@ let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[In64BitMode]>; +} // SchedRW //===----------------------------------------------------------------------===// // Miscellaneous Instructions. // let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { -let mayLoad = 1 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG16>, OpSize; def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG>, OpSize; -def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [], +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], IIC_POP_MEM>, OpSize; def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; -def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [], +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>; def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize; def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, Requires<[In32BitMode]>; -} +} // mayLoad, SchedRW -let mayStore = 1 in { +let mayStore = 1, SchedRW = [WriteStore] in { def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize; def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], @@ -832,29 +877,30 @@ def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, Requires<[In32BitMode]>; -} +} // mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { -let mayLoad = 1 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; -def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], IIC_POP_MEM>; -} -let mayStore = 1 in { +} // mayLoad, SchedRW +let mayStore = 1, SchedRW = [WriteStore] in { def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>; def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], IIC_PUSH_MEM>; -} +} // mayStore, SchedRW } -let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in { +let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1, + SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>; def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), @@ -865,23 +911,24 @@ def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm), let Defs = [RSP, EFLAGS], Uses = [RSP], 
mayLoad = 1, neverHasSideEffects=1 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, - Requires<[In64BitMode]>; + Requires<[In64BitMode]>, Sched<[WriteLoad]>; let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, - Requires<[In64BitMode]>; + Requires<[In64BitMode]>, Sched<[WriteStore]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], - mayLoad=1, neverHasSideEffects=1 in { -def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>, + mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>, Requires<[In32BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], - mayStore=1, neverHasSideEffects=1 in { -def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>, + mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>, Requires<[In32BitMode]>; } -let Constraints = "$src = $dst" in { // GR32 = bswap GR32 +let Constraints = "$src = $dst", SchedRW = [WriteALU] in { +// GR32 = bswap GR32 def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "bswap{l}\t$dst", @@ -890,60 +937,63 @@ def BSWAP32r : I<0xC8, AddRegFrm, def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), "bswap{q}\t$dst", [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW // Bit scan instructions. let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], - IIC_BSF>, TB, OpSize; + IIC_BSF>, TB, OpSize, Sched<[WriteShift]>; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], - IIC_BSF>, TB, OpSize; + IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB, + Sched<[WriteShift]>; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], - IIC_BSF>, TB; + IIC_BSF>, TB, Sched<[WriteShiftLd]>; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], - IIC_BSF>, TB; + IIC_BSF>, TB, Sched<[WriteShift]>; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], - IIC_BSF>, TB; + IIC_BSF>, TB, Sched<[WriteShiftLd]>; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>, - TB, OpSize; + TB, OpSize, Sched<[WriteShift]>; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], IIC_BSR>, TB, - OpSize; + OpSize, Sched<[WriteShiftLd]>; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set 
GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB, + Sched<[WriteShift]>; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], - IIC_BSR>, TB; + IIC_BSR>, TB, Sched<[WriteShiftLd]>; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB, + Sched<[WriteShift]>; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], - IIC_BSR>, TB; + IIC_BSR>, TB, Sched<[WriteShiftLd]>; } // Defs = [EFLAGS] - +let SchedRW = [WriteMicrocoded] in { // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>; @@ -971,12 +1021,12 @@ def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>; def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize; def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>; def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>; - +} // SchedRW //===----------------------------------------------------------------------===// // Move Instructions. // - +let SchedRW = [WriteMove] in { let neverHasSideEffects = 1 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; @@ -987,6 +1037,7 @@ def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", @@ -1004,7 +1055,9 @@ def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; } +} // SchedRW +let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; @@ -1017,45 +1070,60 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW /// moffs8, moffs16 and moffs32 versions of moves. The immediate is a /// 32-bit offset from the PC. These are only valid in x86-32 mode. 
+let SchedRW = [WriteALU] in { def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>, + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>, + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>, + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>, + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; +} -// FIXME: These definitions are utterly broken -// Just leave them commented out for now because they're useless outside -// of the large code model, and most compilers won't generate the instructions -// in question. -/* -def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -*/ - - -let isCodeGenOnly = 1, hasSideEffects = 0 in { +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. 
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, + Requires<[In64BitMode]>; + +def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, + Requires<[In64BitMode]>; +def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, + Requires<[In64BitMode]>; +def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, + Requires<[In64BitMode]>; + +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), @@ -1066,7 +1134,7 @@ def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } -let canFoldAsLoad = 1, isReMaterializable = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; @@ -1081,6 +1149,7 @@ def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; } +let SchedRW = [WriteStore] in { def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; @@ -1093,6 +1162,7 @@ def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW // Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so // that they can be used for copying and storing h registers, which can't be @@ -1101,34 +1171,37 @@ let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, + Sched<[WriteMove]>; let mayStore = 1 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], - IIC_MOV_MEM>; + IIC_MOV_MEM>, Sched<[WriteStore]>; let mayLoad = 1, neverHasSideEffects = 1, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], - IIC_MOV_MEM>; + IIC_MOV_MEM>, Sched<[WriteLoad]>; } // Condition code ops, incl. set if equal/not equal/... 
+let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", [(set EFLAGS, (X86sahf AH))], IIC_AHF>; let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], IIC_AHF>; // AH = flags - +} // SchedRW //===----------------------------------------------------------------------===// // Bit tests instructions: BT, BTS, BTR, BTC. let Defs = [EFLAGS] in { +let SchedRW = [WriteALU] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, @@ -1139,13 +1212,14 @@ def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; +} // SchedRW // Unlike with the register+register form, the memory+register form of the // bt instruction does not ignore the high bits of the index. From ISel's // perspective, this is pretty bizarre. Make these instructions disassembly // only for now. -let mayLoad = 1, hasSideEffects = 0 in { +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi16 addr:$src1), GR16:$src2), @@ -1166,6 +1240,7 @@ let mayLoad = 1, hasSideEffects = 0 in { >, TB; } +let SchedRW = [WriteALU] in { def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], @@ -1178,10 +1253,12 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], IIC_BT_RI>, TB; +} // SchedRW // Note that these instructions don't need FastBTMem because that // only applies when the other operand is in a register. When it's // an immediate, bt is still fast. 
+let SchedRW = [WriteALU] in { def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) @@ -1194,8 +1271,10 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), i64immSExt8:$src2))], IIC_BT_MI>, TB; +} // SchedRW let hasSideEffects = 0 in { +let SchedRW = [WriteALU] in { def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize, TB; @@ -1203,8 +1282,9 @@ def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1214,6 +1294,7 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; } +let SchedRW = [WriteALU] in { def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1221,8 +1302,9 @@ def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1232,6 +1314,7 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; } +let SchedRW = [WriteALU] in { def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize, TB; @@ -1239,8 +1322,9 @@ def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1250,6 +1334,7 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; } +let SchedRW = [WriteALU] in { def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1257,8 +1342,9 @@ def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], 
IIC_BTX_RI>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1268,6 +1354,7 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; } +let SchedRW = [WriteALU] in { def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize, TB; @@ -1275,8 +1362,9 @@ def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize, TB; @@ -1286,6 +1374,7 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; } +let SchedRW = [WriteALU] in { def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize, TB; @@ -1293,8 +1382,9 @@ def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize, TB; @@ -1315,7 +1405,7 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), // operand is referenced, the atomicity is ensured. multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, InstrItinClass itin> { - let Constraints = "$val = $dst" in { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), @@ -1350,6 +1440,7 @@ multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; // Swap between registers. +let SchedRW = [WriteALU] in { let Constraints = "$val = $dst" in { def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; @@ -1363,20 +1454,20 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), // Swap between EAX and other registers. 
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), - "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; + "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In32BitMode]>; // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In64BitMode]>; def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), - "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; - - + "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; +} // SchedRW +let SchedRW = [WriteALU] in { def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), @@ -1386,8 +1477,9 @@ def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +} // SchedRW -let mayLoad = 1, mayStore = 1 in { +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), @@ -1400,6 +1492,7 @@ def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), } +let SchedRW = [WriteALU] in { def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG8>, TB; @@ -1412,7 +1505,9 @@ def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG>, TB; +} // SchedRW +let SchedRW = [WriteALULd, WriteRMW] in { let mayLoad = 1, mayStore = 1 in { def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], @@ -1436,7 +1531,7 @@ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, TB, Requires<[HasCmpxchg16b]>; - +} // SchedRW // Lock instruction prefix @@ -1459,17 +1554,21 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; // String manipulation instructions +let SchedRW = [WriteMicrocoded] in { def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>; def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize; def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>; def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>; +} +let SchedRW = [WriteSystem] in { def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>; def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize; def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>; - +} // Flag instructions +let SchedRW = [WriteALU] in { def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; def STC : I<0xF9, 
RawFrm, (outs), (ins), "stc", [], IIC_STC>; def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; @@ -1479,10 +1578,13 @@ def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} // Table lookup instructions -def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>; +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, + Sched<[WriteLoad]>; +let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition // sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, @@ -1512,7 +1614,9 @@ def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, // sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, Requires<[In32BitMode]>; +} // SchedRW +let SchedRW = [WriteSystem] in { // Check Array Index Against Bounds def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize, @@ -1528,11 +1632,13 @@ def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, Requires<[In32BitMode]>; +} // SchedRW //===----------------------------------------------------------------------===// // MOVBE Instructions // let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, @@ -1545,6 +1651,8 @@ let Predicates = [HasMOVBE] in { "movbe{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, T8; + } + let SchedRW = [WriteStore] in { def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, @@ -1557,6 +1665,7 @@ let Predicates = [HasMOVBE] in { "movbe{q}\t{$src, $dst|$dst, $src}", [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, T8; + } } //===----------------------------------------------------------------------===// @@ -1575,6 +1684,21 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS] in { } //===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), + "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), + "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), + "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; +} + +//===----------------------------------------------------------------------===// // LZCNT Instruction // let Predicates = [HasLZCNT], Defs = [EFLAGS] in { @@ -1737,6 +1861,7 @@ include "X86InstrXOP.td" // SSE, MMX and 3DNow! vector support. 
include "X86InstrSSE.td" +include "X86InstrAVX512.td" include "X86InstrMMX.td" include "X86Instr3DNow.td" @@ -1755,115 +1880,120 @@ include "X86InstrCompiler.td" // Assembler Mnemonic Aliases //===----------------------------------------------------------------------===// -def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>; -def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>; +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"cbw", "cbtw">; -def : MnemonicAlias<"cwde", "cwtl">; -def : MnemonicAlias<"cwd", "cwtd">; -def : MnemonicAlias<"cdq", "cltd">; -def : MnemonicAlias<"cdqe", "cltq">; -def : MnemonicAlias<"cqo", "cqto">; +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; // lret maps to lretl, it is not ambiguous with lretq. -def : MnemonicAlias<"lret", "lretl">; +def : MnemonicAlias<"lret", "lretl", "att">; -def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>; -def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>; +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope">; -def : MnemonicAlias<"loopnz", "loopne">; +def : MnemonicAlias<"loopz", "loope", "att">; +def : MnemonicAlias<"loopnz", "loopne", "att">; -def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>; -def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>; -def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>; -def : MnemonicAlias<"popf", "popfq">, Requires<[In64BitMode]>; -def : MnemonicAlias<"popfd", "popfl">; +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; // FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in // all modes. However: "push (addr)" and "push $42" should default to // pushl/pushq depending on the current mode. 
Similar for "pop %bx" -def : MnemonicAlias<"push", "pushl">, Requires<[In32BitMode]>; -def : MnemonicAlias<"push", "pushq">, Requires<[In64BitMode]>; -def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>; -def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>; -def : MnemonicAlias<"pushfd", "pushfl">; +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"popad", "popa", "intel">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushad", "pusha", "intel">, Requires<[In32BitMode]>; -def : MnemonicAlias<"repe", "rep">; -def : MnemonicAlias<"repz", "rep">; -def : MnemonicAlias<"repnz", "repne">; +def : MnemonicAlias<"repe", "rep", "att">; +def : MnemonicAlias<"repz", "rep", "att">; +def : MnemonicAlias<"repnz", "repne", "att">; -def : MnemonicAlias<"retl", "ret">, Requires<[In32BitMode]>; -def : MnemonicAlias<"retq", "ret">, Requires<[In64BitMode]>; +def : MnemonicAlias<"retl", "ret", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retq", "ret", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"salb", "shlb">; -def : MnemonicAlias<"salw", "shlw">; -def : MnemonicAlias<"sall", "shll">; -def : MnemonicAlias<"salq", "shlq">; +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; -def : MnemonicAlias<"smovb", "movsb">; -def : MnemonicAlias<"smovw", "movsw">; -def : MnemonicAlias<"smovl", "movsl">; -def : MnemonicAlias<"smovq", "movsq">; +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; -def : MnemonicAlias<"ud2a", "ud2">; -def : MnemonicAlias<"verrw", "verr">; +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; // System instruction aliases. 
-def : MnemonicAlias<"iret", "iretl">; -def : MnemonicAlias<"sysret", "sysretl">; -def : MnemonicAlias<"sysexit", "sysexitl">; +def : MnemonicAlias<"iret", "iretl", "att">; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; -def : MnemonicAlias<"lgdtl", "lgdt">, Requires<[In32BitMode]>; -def : MnemonicAlias<"lgdtq", "lgdt">, Requires<[In64BitMode]>; -def : MnemonicAlias<"lidtl", "lidt">, Requires<[In32BitMode]>; -def : MnemonicAlias<"lidtq", "lidt">, Requires<[In64BitMode]>; -def : MnemonicAlias<"sgdtl", "sgdt">, Requires<[In32BitMode]>; -def : MnemonicAlias<"sgdtq", "sgdt">, Requires<[In64BitMode]>; -def : MnemonicAlias<"sidtl", "sidt">, Requires<[In32BitMode]>; -def : MnemonicAlias<"sidtq", "sidt">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lgdtl", "lgdt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdtq", "lgdt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidtl", "lidt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidtq", "lidt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdtl", "sgdt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdtq", "sgdt", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidtl", "sidt", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidtq", "sidt", "att">, Requires<[In64BitMode]>; // Floating point stack aliases. -def : MnemonicAlias<"fcmovz", "fcmove">; -def : MnemonicAlias<"fcmova", "fcmovnbe">; -def : MnemonicAlias<"fcmovnae", "fcmovb">; -def : MnemonicAlias<"fcmovna", "fcmovbe">; -def : MnemonicAlias<"fcmovae", "fcmovnb">; -def : MnemonicAlias<"fcomip", "fcompi">; -def : MnemonicAlias<"fildq", "fildll">; -def : MnemonicAlias<"fistpq", "fistpll">; -def : MnemonicAlias<"fisttpq", "fisttpll">; -def : MnemonicAlias<"fldcww", "fldcw">; -def : MnemonicAlias<"fnstcww", "fnstcw">; -def : MnemonicAlias<"fnstsww", "fnstsw">; -def : MnemonicAlias<"fucomip", "fucompi">; -def : MnemonicAlias<"fwait", "wait">; - - -class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond> +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fwait", "wait", "att">; + + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), - !strconcat(Prefix, NewCond, Suffix)>; + !strconcat(Prefix, NewCond, Suffix), VariantName>; /// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of /// MnemonicAlias's that canonicalize the condition code in a mnemonic, for /// example "setz" -> "sete". 
-multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> { - def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb - def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete - def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe - def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae - def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae - def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle - def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge - def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne - def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp - def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp - - def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb - def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta - def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl - def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg +multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg } // Aliases for set<CC> @@ -1871,9 +2001,11 @@ defm : IntegerCondCodeMnemonicAlias<"set", "">; // Aliases for j<CC> defm : IntegerCondCodeMnemonicAlias<"j", "">; // Aliases for cmov<CC>{w,l,q} -defm : IntegerCondCodeMnemonicAlias<"cmov", "w">; -defm : IntegerCondCodeMnemonicAlias<"cmov", "l">; -defm : IntegerCondCodeMnemonicAlias<"cmov", "q">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// @@ -1885,75 +2017,83 @@ def : InstAlias<"aad", (AAD8i8 10)>; def : InstAlias<"aam", (AAM8i8 10)>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. -def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>; +// Likewise for btc/btr/bts. 
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btc {$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btr {$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"bts {$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. -def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>; +def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // div and idiv aliases for explicit A register. -def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>; -def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>; -def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>; -def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>; -def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>; -def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>; -def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>; -def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>; -def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>; -def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>; -def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>; -def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>; -def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>; -def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>; -def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>; -def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. 
// For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; -def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; -def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; -def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; -def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; -def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; -def : InstAlias<"fxch", (XCH_F ST1)>; -def : InstAlias<"fcom", (COM_FST0r ST1)>; -def : InstAlias<"fcomp", (COMP_FST0r ST1)>; -def : InstAlias<"fcomi", (COM_FIr ST1)>; -def : InstAlias<"fcompi", (COM_FIPr ST1)>; -def : InstAlias<"fucom", (UCOM_Fr ST1)>; -def : InstAlias<"fucomp", (UCOM_FPr ST1)>; -def : InstAlias<"fucomi", (UCOM_FIr ST1)>; -def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; // Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), (Inst RST:$op), EmitAlias>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; -defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; -defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; defm : FpUnaryAlias<"fmul", MUL_FST0r>; defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; defm : FpUnaryAlias<"fdiv", DIV_FST0r>; -defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; -defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; @@ -1963,16 +2103,16 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. 
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; -def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; -def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; -def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; -def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; -def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. -def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; -def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but @@ -1991,12 +2131,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; // inb %dx -> inb %al, %dx -def : InstAlias<"inb %dx", (IN8rr)>; -def : InstAlias<"inw %dx", (IN16rr)>; -def : InstAlias<"inl %dx", (IN32rr)>; -def : InstAlias<"inb $port", (IN8ri i8imm:$port)>; -def : InstAlias<"inw $port", (IN16ri i8imm:$port)>; -def : InstAlias<"inl $port", (IN32ri i8imm:$port)>; +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp @@ -2024,7 +2164,7 @@ def : InstAlias<"movq $src, $dst", // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. (as in rep; movsd) -def : InstAlias<"movsd", (MOVSD)>; +def : InstAlias<"movsd", (MOVSD), 0>; // movsx aliases def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; @@ -2045,12 +2185,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx -def : InstAlias<"outb %dx", (OUT8rr)>; -def : InstAlias<"outw %dx", (OUT16rr)>; -def : InstAlias<"outl %dx", (OUT32rr)>; -def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>; -def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>; -def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). 
Force to sldtw to avoid ambiguity @@ -2058,19 +2198,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; // shld/shrd op,op -> shld op, op, CL -def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; -def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; - -def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; -def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. @@ -2101,19 +2241,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. -def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. 
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. -def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; -def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>; +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 127af6f..cb12956 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -20,6 +20,7 @@ // MMX Multiclasses //===----------------------------------------------------------------------===// +let Sched = WriteVecALU in { def MMX_INTALU_ITINS : OpndItins< IIC_MMX_ALU_RR, IIC_MMX_ALU_RM >; @@ -35,11 +36,14 @@ def MMX_PHADDSUBW : OpndItins< def MMX_PHADDSUBD : OpndItins< IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM >; +} +let Sched = WriteVecIMul in def MMX_PMUL_ITINS : OpndItins< IIC_MMX_PMUL, IIC_MMX_PMUL >; +let Sched = WriteVecALU in { def MMX_PSADBW_ITINS : OpndItins< IIC_MMX_PSADBW, IIC_MMX_PSADBW >; @@ -47,11 +51,13 @@ def MMX_PSADBW_ITINS : OpndItins< def MMX_MISC_FUNC_ITINS : OpndItins< IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG >; +} def MMX_SHIFT_ITINS : ShiftOpndItins< IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI >; +let Sched = WriteShuffle in { def MMX_UNPCK_H_ITINS : OpndItins< IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM >; @@ -67,7 +73,9 @@ def MMX_PCK_ITINS : OpndItins< def MMX_PSHUF_ITINS : OpndItins< IIC_MMX_PSHUF, IIC_MMX_PSHUF >; +} // Sched +let Sched = WriteCvtF2I in { def MMX_CVT_PD_ITINS : OpndItins< IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM >; @@ -75,6 +83,7 @@ def MMX_CVT_PD_ITINS : OpndItins< def MMX_CVT_PS_ITINS : OpndItins< IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM >; +} let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. 
@@ -84,7 +93,8 @@ let Constraints = "$src1 = $dst" in { def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> { + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]> { let isCommutable = Commutable; } def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), @@ -92,7 +102,7 @@ let Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2))))], - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, @@ -101,17 +111,19 @@ let Constraints = "$src1 = $dst" in { def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>; + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[WriteVecShift]>; def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2))))], - itins.rm>; + itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>; + [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>, + Sched<[WriteVecShift]>; } } @@ -120,13 +132,14 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, Intrinsic IntId64, OpndItins itins> { def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>; + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>, + Sched<[itins.Sched]>; def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 (bitconvert (memopmmx addr:$src))))], - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded]>; } /// Binary MMX instructions requiring SSSE3. 
@@ -137,13 +150,15 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>; + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]>; def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, - (bitconvert (memopmmx addr:$src2))))], itins.rm>; + (bitconvert (memopmmx addr:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -164,21 +179,24 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins, Domain d> { def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>; + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>, + Sched<[itins.Sched]>; def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>; + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>, + Sched<[itins.Sched.Folded]>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { - def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2), - asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], - IIC_DEFAULT, d>; - def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], - IIC_DEFAULT, d>; + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + NoItinerary, d>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + NoItinerary, d>; } //===----------------------------------------------------------------------===// @@ -197,16 +215,17 @@ def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (scalar_to_vector GR32:$src)))], - IIC_MMX_MOV_MM_RM>; + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; let canFoldAsLoad = 1 in def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (scalar_to_vector (loadi32 addr:$src))))], - IIC_MMX_MOV_MM_RM>; + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>; + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, + Sched<[WriteStore]>; // Low word of MMX to GPR. 
def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, @@ -214,16 +233,18 @@ def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, - (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>; + (MMX_X86movd2w (x86mmx VR64:$src)))], + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; let neverHasSideEffects = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", - [], IIC_MMX_MOV_MM_RM>; + [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. +let SchedRW = [WriteMove] in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", @@ -237,6 +258,9 @@ let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; +} // SchedRW + +let SchedRW = [WriteLoad] in { let canFoldAsLoad = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", @@ -246,7 +270,9 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (x86mmx VR64:$src), addr:$dst)], IIC_MMX_MOVQ_RM>; +} // SchedRW +let SchedRW = [WriteMove] in { def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, @@ -271,11 +297,12 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; +} // SchedRW def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], - IIC_MMX_MOVQ_RM>; + IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>; let AddedComplexity = 15 in // movd to MMX register zero-extends @@ -283,7 +310,7 @@ def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))], - IIC_MMX_MOV_MM_RM>; + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; let AddedComplexity = 20 in def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), @@ -291,7 +318,7 @@ def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), [(set VR64:$dst, (x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))))], - IIC_MMX_MOV_MM_RM>; + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; // Arithmetic Instructions defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, @@ -331,21 +358,21 @@ defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, MMX_INTALU_ITINS>; defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, - MMX_INTALUQ_ITINS, 1>; + MMX_INTALUQ_ITINS>; defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, - MMX_INTALU_ITINS, 1>; 
+ MMX_INTALU_ITINS>; defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, MMX_PHADDSUBW>; @@ -491,14 +518,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], - IIC_MMX_PSHUF>; + IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), imm:$src2))], - IIC_MMX_PSHUF>; + IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>; @@ -532,7 +559,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, (iPTR imm:$src2)))], - IIC_MMX_PEXTR>; + IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), @@ -540,7 +567,7 @@ let Constraints = "$src1 = $dst" in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32:$src2, (iPTR imm:$src3)))], - IIC_MMX_PINSRW>; + IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), @@ -549,7 +576,7 @@ let Constraints = "$src1 = $dst" in { [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), (iPTR imm:$src3)))], - IIC_MMX_PINSRW>; + IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } // Mask creation @@ -570,6 +597,7 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), (x86mmx (MMX_MOVQ64rm addr:$src))>; // Misc. +let SchedRW = [WriteShuffle] in { let Uses = [EDI] in def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", @@ -580,6 +608,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], IIC_MMX_MASKMOV>; +} // 64-bit bit convert. 
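The notable fix in the psub block above is the dropped trailing argument: the final parameter of MMXI_binop_rm_int is a Commutable bit that feeds isCommutable (see the "let isCommutable = Commutable" in the first hunk), and the subtract forms had been passing 1 even though psub/psubs/psubus are not commutative, which could have let the compiler swap their operands. A reduced sketch of the flag being corrected; Demo* names are illustrative, not from the tree:

    // The multiclass forwards an optional Commutable bit into isCommutable,
    // so only genuinely commutative ops should pass 1.
    class DemoInst {
      bit isCommutable = 0;   // allows operand swapping during 2-addr lowering
    }
    multiclass DemoBinop<string mnemonic, bit Commutable = 0> {
      def rr : DemoInst { let isCommutable = Commutable; }
      def rm : DemoInst;
    }
    defm DEMO_PADDW : DemoBinop<"paddw", 1>;   // addition commutes
    defm DEMO_PSUBW : DemoBinop<"psubw">;      // subtraction does not: keep the default 0
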
let Predicates = [HasSSE2] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 105963f..a86006a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -35,6 +35,7 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, // scalar +let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM >; @@ -42,6 +43,7 @@ def SSE_ALU_F32S : OpndItins< def SSE_ALU_F64S : OpndItins< IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM >; +} def SSE_ALU_ITINS_S : SizeItins< SSE_ALU_F32S, SSE_ALU_F64S @@ -76,6 +78,7 @@ def SSE_DIV_ITINS_S : SizeItins< >; // parallel +let Sched = WriteFAdd in { def SSE_ALU_F32P : OpndItins< IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM >; @@ -83,6 +86,7 @@ def SSE_ALU_F32P : OpndItins< def SSE_ALU_F64P : OpndItins< IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM >; +} def SSE_ALU_ITINS_P : SizeItins< SSE_ALU_F32P, SSE_ALU_F64P @@ -184,14 +188,16 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>( !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], itins.rr>; + RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, mem_cpat:$src2))], itins.rm>; + RC:$src1, mem_cpat:$src2))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } /// sse12_fp_packed - SSE 1 & 2 packed instructions class @@ -226,13 +232,13 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rr, IIC_DEFAULT, d>, + pat_rr, NoItinerary, d>, Sched<[WriteVecLogic]>; def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rm, IIC_DEFAULT, d>, + pat_rm, NoItinerary, d>, Sched<[WriteVecLogicLd, ReadAfterLd]>; } @@ -364,7 +370,7 @@ let Predicates = [HasAVX] in { // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1 in { + isPseudo = 1, SchedRW = [WriteZero] in { def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", @@ -381,7 +387,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1 in { + isPseudo = 1, SchedRW = [WriteZero] in { def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; } @@ -398,7 +404,7 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0)>; // at the rename stage without using any execution unit, so SET0PSY // and SET0PDY can be used for vector int instructions without penalty let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, Predicates = [HasAVX] in { + isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in { def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8f32 immAllZerosV))]>; } @@ -436,7 +442,7 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1 in { + isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; let Predicates = [HasAVX2] in @@ -470,7 +476,7 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr), - [], IIC_SSE_MOV_S_RR>; + [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>; } multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, @@ -848,7 +854,7 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), } // SchedRW // For disassembler -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], @@ -924,7 +930,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), } // SchedRW // For disassembler -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1070,7 +1076,7 @@ let Predicates = [UseSSE1] in { // Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper // bits are disregarded. FIXME: Set encoding to pseudo! -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, SchedRW = [WriteMove] in { def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, VEX; @@ -1087,7 +1093,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), // Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper // bits are disregarded. FIXME: Set encoding to pseudo! 
-let canFoldAsLoad = 1, isReMaterializable = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { let isCodeGenOnly = 1 in { def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), "movaps\t{$src, $dst|$dst, $src}", @@ -1436,11 +1442,13 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let neverHasSideEffects = 1 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2F]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2FLd, ReadAfterLd]>; } // neverHasSideEffects = 1 } @@ -1740,13 +1748,15 @@ let neverHasSideEffects = 1 in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG; + IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG; + XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; } def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, @@ -1755,26 +1765,28 @@ def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (fround FR64:$src))], - IIC_SSE_CVT_Scalar_RR>; + IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (fround (loadf64 addr:$src)))], IIC_SSE_CVT_Scalar_RM>, XD, - Requires<[UseSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>; + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>; + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, @@ -1782,13 +1794,15 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, 
$src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; } // Convert scalar single to scalar double @@ -1798,13 +1812,15 @@ def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RR>, - XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG; + XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; + XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; } def : Pat<(f64 (fextend FR32:$src)), @@ -1823,12 +1839,12 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fextend FR32:$src))], IIC_SSE_CVT_Scalar_RR>, XS, - Requires<[UseSSE2]>; + Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (extloadf32 addr:$src))], IIC_SSE_CVT_Scalar_RM>, XS, - Requires<[UseSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; // extload f32 -> f64. This matches load+fextend because we have a hack in // the isel (PreprocessForFPConvert) that can introduce loads after dag @@ -1845,57 +1861,61 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>; + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>; + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; } // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>, VEX; + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq 
(memopv4f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 VR256:$src))], - IIC_SSE_CVT_PS_RR>, VEX, VEX_L; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX, VEX_L; + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>; + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>; + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; // Convert Packed Double FP to Packed DW Integers @@ -1906,7 +1926,7 @@ let Predicates = [HasAVX] in { def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, - VEX; + VEX, Sched<[WriteCvtF2I]>; // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", @@ -1914,18 +1934,20 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX; + (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX, + Sched<[WriteCvtF2ILd]>; // YMM only def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L; + (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, + Sched<[WriteCvtF2I]>; def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>, - VEX, VEX_L; + VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; } @@ -1934,11 +1956,11 @@ def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>; + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>; + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix @@ -1946,32 +1968,33 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>, VEX; + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq 
(memopv4f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], - IIC_SSE_CVT_PS_RR>, VEX, VEX_L; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX, VEX_L; + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + Sched<[WriteCvtF2ILd]>; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>; + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>; + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), @@ -2021,7 +2044,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, VEX; + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. @@ -2034,19 +2057,19 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memopv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX; + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], - IIC_SSE_CVT_PD_RR>, VEX, VEX_L; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX, VEX_L; + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; @@ -2060,12 +2083,13 @@ let Predicates = [HasAVX] in { def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>; + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memopv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>; + IIC_SSE_CVT_PD_RM>, + Sched<[WriteCvtF2ILd]>; // Convert packed single to packed double let Predicates = [HasAVX] in { @@ -2073,32 +2097,32 @@ let Predicates = [HasAVX] in { def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, VEX; + IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>; def VCVTPS2PDrm : 
I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, VEX; + IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L; + IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L; + IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; } let Predicates = [UseSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, TB; + IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, TB; + IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>; } // Convert Packed DW Integers to Packed Double FP @@ -2106,30 +2130,33 @@ let Predicates = [HasAVX] in { let neverHasSideEffects = 1, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - []>, VEX; + []>, VEX, Sched<[WriteCvtI2FLd]>; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX; + (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX, + Sched<[WriteCvtI2F]>; def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtdq2_pd_256 - (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L; + (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L, + Sched<[WriteCvtI2FLd]>; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L; + (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L, + Sched<[WriteCvtI2F]>; } let neverHasSideEffects = 1, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>; + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], - IIC_SSE_CVT_PD_RM>; + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; // AVX 256-bit register conversion intrinsics let Predicates = [HasAVX] in { @@ -2146,7 +2173,7 @@ let Predicates = [HasAVX] in { def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>, VEX; + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>; // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", @@ -2155,31 +2182,31 @@ def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memopv2f64 
addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX; + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; // YMM only def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvt_pd2_ps_256 VR256:$src))], - IIC_SSE_CVT_PD_RR>, VEX, VEX_L; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX, VEX_L; + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>; + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>; + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>; // AVX 256-bit register conversion intrinsics @@ -2244,11 +2271,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, let neverHasSideEffects = 1 in { def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], - IIC_SSE_ALU_F32S_RR>; + IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; let mayLoad = 1 in def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], - IIC_SSE_ALU_F32S_RM>; + IIC_SSE_ALU_F32S_RM>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -2314,65 +2342,62 @@ let Constraints = "$src1 = $dst" in { // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr, Domain d> { - def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + PatFrag ld_frag, string OpcodeStr> { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], - IIC_SSE_COMIS_RR, d>, + IIC_SSE_COMIS_RR>, Sched<[WriteFAdd]>; - def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), (ld_frag addr:$src2)))], - IIC_SSE_COMIS_RM, d>, + IIC_SSE_COMIS_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG; + "ucomiss">, TB, VEX, VEX_LIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, TB, OpSize, VEX, - VEX_LIG; + "ucomisd">, TB, OpSize, VEX, VEX_LIG; let Pattern = []<dag> in { defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB, VEX, - VEX_LIG; + "comiss">, TB, VEX, VEX_LIG; defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize, VEX, - VEX_LIG; + "comisd">, TB, OpSize, VEX, VEX_LIG; } defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, 
"ucomiss", SSEPackedSingle>, TB, VEX; + load, "ucomiss">, TB, VEX; defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; + load, "ucomisd">, TB, OpSize, VEX; defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, TB, VEX; + load, "comiss">, TB, VEX; defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, TB, OpSize, VEX; + load, "comisd">, TB, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB; + "ucomiss">, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, TB, OpSize; + "ucomisd">, TB, OpSize; let Pattern = []<dag> in { defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB; + "comiss">, TB; defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize; + "comisd">, TB, OpSize; } defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, TB; + load, "ucomiss">, TB; defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, TB, OpSize; + load, "ucomisd">, TB, OpSize; defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB; + "comiss">, TB; defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize; + "comisd">, TB, OpSize; } // Defs = [EFLAGS] // sse12_cmp_packed - sse 1 & 2 compare packed instructions @@ -2394,10 +2419,11 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, let neverHasSideEffects = 1 in { def rri_alt : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_CMPP_RR, d>; + asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>; def rmi_alt : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_CMPP_RM, d>; + asm_alt, [], IIC_SSE_CMPP_RM, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; } } @@ -2694,18 +2720,18 @@ let Predicates = [HasAVX] in { // Assembler Only def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedSingle>, TB, VEX; + SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>; def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, SSEPackedDouble>, TB, - OpSize, VEX; + OpSize, VEX, Sched<[WriteVecLogic]>; def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedSingle>, TB, VEX, VEX_L; + SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>; def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, SSEPackedDouble>, TB, - OpSize, VEX, VEX_L; + OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>; } defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", @@ -2817,8 +2843,8 @@ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, SSE_BIT_ITINS_P>; -let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in - defm FsANDN : 
sse12_fp_alias_pack_logical<0x55, "andn", undef, +let isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, SSE_BIT_ITINS_P>; /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops @@ -3020,12 +3046,20 @@ let isCodeGenOnly = 1 in { /// And, we have a special variant form for a full-vector intrinsic form. let Sched = WriteFSqrt in { -def SSE_SQRTP : OpndItins< - IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM +def SSE_SQRTPS : OpndItins< + IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM +>; + +def SSE_SQRTSS : OpndItins< + IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM +>; + +def SSE_SQRTPD : OpndItins< + IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM >; -def SSE_SQRTS : OpndItins< - IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM +def SSE_SQRTSD : OpndItins< + IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM >; } @@ -3290,18 +3324,18 @@ let Predicates = [HasAVX] in { // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, - SSE_SQRTS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>, + SSE_SQRTSS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTS>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>; + SSE_SQRTSD>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTP>; + int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, @@ -3458,7 +3492,7 @@ def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), //===----------------------------------------------------------------------===// // Prefetch intrinsic. -let Predicates = [HasSSE1] in { +let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], IIC_SSE_PREFETCH>, TB; @@ -3473,6 +3507,8 @@ def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), IIC_SSE_PREFETCH>, TB; } +// FIXME: How should these memory instructions be modeled? 
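The note earlier in this hunk that the rcp/rsqrt approximations "typically require refinement" refers to the standard Newton-Raphson iteration applied to the hardware estimate, which roughly doubles the number of correct bits per step. Stated as plain recurrences (not code from this patch), where a is the input and x_n the current estimate:

    reciprocal      (x ~= 1/a):       x_{n+1} = x_n * (2 - a * x_n)
    reciprocal sqrt (x ~= 1/sqrt(a)): x_{n+1} = 0.5 * x_n * (3 - a * x_n * x_n)
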
+let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], @@ -3492,6 +3528,7 @@ def LFENCE : I<0xAE, MRM_E8, (outs), (ins), def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, TB, Requires<[HasSSE2]>; +} // SchedRW def : Pat<(X86SFence), (SFENCE)>; def : Pat<(X86LFence), (LFENCE)>; @@ -3503,17 +3540,17 @@ def : Pat<(X86MFence), (MFENCE)>; def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>, VEX; + IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>, VEX; + IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>; + IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>; def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>; + IIC_SSE_STMXCSR>, Sched<[WriteStore]>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -4337,43 +4374,43 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int // -def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), +def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; -def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), +def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; -def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; -def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], 
IIC_SSE_MOVDQ>, Sched<[WriteMove]>; -def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; @@ -4381,22 +4418,22 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), //===---------------------------------------------------------------------===// // Move Int Doubleword to Single Scalar // -def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), +def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), +def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; -def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), +def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; -def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), +def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; @@ -4404,23 +4441,23 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), //===---------------------------------------------------------------------===// // Move Packed Doubleword Int to Packed Double Int // -def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), +def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; -def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), +def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; -def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), +def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; -def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), +def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], @@ -4430,14 +4467,14 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), // Move Packed Doubleword Int first element to Doubleword Int // let SchedRW = [WriteMove] in { -def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "vmov{d|q}\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - TB, OpSize, VEX, VEX_W, Requires<[HasAVX, 
In64BitMode]>; + VEX; -def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), +def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], @@ -4452,11 +4489,11 @@ def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, VEX, Sched<[WriteLoad]>; -def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64:$src)), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; @@ -4465,11 +4502,11 @@ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; -def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; -def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64:$src)), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4477,33 +4514,34 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), //===---------------------------------------------------------------------===// // Move Scalar Single to Double Int // -def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), +def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))], - IIC_SSE_MOVD_ToGP>, VEX; -def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; +def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, VEX; -def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))], - IIC_SSE_MOVD_ToGP>; -def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; +def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)], - IIC_SSE_MOVDQ>; + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; //===---------------------------------------------------------------------===// // Patterns and instructions to describe movd/movq to XMM register zero-extends // +let SchedRW = [WriteMove] in { let AddedComplexity = 15 in { -def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs 
VR128:$dst), (ins GR32:$src), +def VMOVZDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))))], IIC_SSE_MOVDQ>, VEX; -def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))))], @@ -4511,32 +4549,33 @@ def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), VEX, VEX_W; } let AddedComplexity = 15 in { -def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), +def MOVZDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))))], IIC_SSE_MOVDQ>; -def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))))], IIC_SSE_MOVDQ>; } +} // SchedRW -let AddedComplexity = 20 in { -def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +let AddedComplexity = 20, SchedRW = [WriteLoad] in { +def VMOVZDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], IIC_SSE_MOVDQ>, VEX; -def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], IIC_SSE_MOVDQ>; -} +} // AddedComplexity, SchedRW let Predicates = [HasAVX] in { // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
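The zero-extending move hunks above use the block form of the same annotation: 'let SchedRW = [...] in { ... }' fills the SchedRW field for every def in the region (and composes with other lets, as in 'let AddedComplexity = 20, SchedRW = [WriteLoad] in'), while the Sched<[...]> mixin seen elsewhere in the patch sets it per instruction. A small sketch showing that the two spellings populate the same field; Demo* names are illustrative stand-ins:

    class DemoWrite;
    def DemoWriteMove : DemoWrite;
    def DemoWriteLoad : DemoWrite;

    class DemoInst {
      list<DemoWrite> SchedRW;     // same field the real Instruction class carries
    }
    class DemoSched<list<DemoWrite> rw> {
      list<DemoWrite> SchedRW = rw;
    }

    // Per-def: mix the Sched-like class into a single definition.
    def DemoMOVZrr : DemoInst, DemoSched<[DemoWriteMove]>;

    // Block form: a top-level let overrides the field for every def inside.
    let SchedRW = [DemoWriteLoad] in {
      def DemoMOVZrm  : DemoInst;
      def DemoMOVZQrm : DemoInst;
    } // SchedRW
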
@@ -4585,6 +4624,8 @@ def : InstAlias<"movq\t{$src, $dst|$dst, $src}", //===---------------------------------------------------------------------===// // Move Quadword Int to Packed Quadword Int // + +let SchedRW = [WriteLoad] in { def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4596,31 +4637,35 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (scalar_to_vector (loadi64 addr:$src))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix +} // SchedRW //===---------------------------------------------------------------------===// // Move Packed Quadword Int to Quadword Int // -def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +let SchedRW = [WriteStore] in { +def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; -def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; +} // SchedRW //===---------------------------------------------------------------------===// // Store / copy lower 64-bits of a XMM register. // -def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; -def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX, + Sched<[WriteStore]>; +def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)], - IIC_SSE_MOVDQ>; + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), @@ -4629,7 +4674,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, VEX, Requires<[HasAVX]>; + XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>; let AddedComplexity = 20 in def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), @@ -4638,7 +4683,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, Requires<[UseSSE2]>; + XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; let Predicates = [HasAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), @@ -4668,6 +4713,7 @@ def : Pat<(v4i64 (X86vzload addr:$src)), // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
// +let SchedRW = [WriteVecLogic] in { let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -4680,7 +4726,9 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, XS, Requires<[UseSSE2]>; +} // SchedRW +let SchedRW = [WriteVecLogicLd] in { let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -4696,6 +4744,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; } +} // SchedRW let AddedComplexity = 20 in { let Predicates = [HasAVX] in { @@ -4713,26 +4762,30 @@ let AddedComplexity = 20 in { } // Instructions to match in the assembler -def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +let SchedRW = [WriteMove] in { +def VMOVQs64rr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, VEX_W; -def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), +def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, VEX_W; // Recognize "movd" with GR64 destination, but encode as a "movq" -def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), +def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, VEX_W; +} // SchedRW // Instructions for the disassembler // xr = XMM register // xm = mem64 +let SchedRW = [WriteMove] in { let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; +} // SchedRW //===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP @@ -4743,11 +4796,11 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (vt (OpNode RC:$src)))], - IIC_SSE_MOV_LH>; + IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (OpNode (mem_frag addr:$src)))], - IIC_SSE_MOV_LH>; + IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>; } let Predicates = [HasAVX] in { @@ -4803,25 +4856,27 @@ multiclass sse3_replicate_dfp<string OpcodeStr> { let neverHasSideEffects = 1 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [], IIC_SSE_MOV_LH>; + [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v2f64 (X86Movddup (scalar_to_vector (loadf64 addr:$src)))))], - IIC_SSE_MOV_LH>; + IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>; } // FIXME: Merge with above classe when there're patterns for the ymm version multiclass sse3_replicate_dfp_y<string OpcodeStr> { def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, 
"\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>; + [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, + Sched<[WriteShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (v4f64 (X86Movddup - (scalar_to_vector (loadf64 addr:$src)))))]>; + (scalar_to_vector (loadf64 addr:$src)))))]>, + Sched<[WriteShuffleLd]>; } let Predicates = [HasAVX] in { @@ -4869,6 +4924,7 @@ let Predicates = [UseSSE3] in { // SSE3 - Move Unaligned Integer //===---------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", @@ -4882,6 +4938,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "lddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))], IIC_SSE_LDDQU>; +} //===---------------------------------------------------------------------===// // SSE3 - Arithmetic @@ -4895,13 +4952,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>; + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; def rm : I<0xD0, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>; + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { @@ -4938,14 +4997,15 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], - IIC_SSE_HADDSUB_RM>; + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { @@ -4953,14 +5013,15 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], - IIC_SSE_HADDSUB_RM>; + IIC_SSE_HADDSUB_RM>, 
Sched<[WriteFAddLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { @@ -5009,7 +5070,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, - OpSize; + OpSize, Sched<[WriteVecALU]>; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -5017,7 +5078,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (IntId128 (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, - OpSize; + OpSize, Sched<[WriteVecALULd]>; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. @@ -5027,16 +5088,27 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 VR256:$src))]>, - OpSize; + OpSize, Sched<[WriteVecALU]>; def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (memopv4i64 addr:$src))))]>, OpSize; + (bitconvert (memopv4i64 addr:$src))))]>, OpSize, + Sched<[WriteVecALULd]>; } +// Helper fragments to match sext vXi1 to vXiY. +def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), + VR128:$src))>; +def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>; +def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>; +def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), + VR256:$src))>; +def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>; +def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>; + let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128>, VEX; @@ -5044,6 +5116,19 @@ let Predicates = [HasAVX] in { int_x86_ssse3_pabs_w_128>, VEX; defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128>, VEX; + + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (VPABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (VPABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (VPABSDrr128 VR128:$src)>; } let Predicates = [HasAVX2] in { @@ -5053,6 +5138,19 @@ let Predicates = [HasAVX2] in { int_x86_avx2_pabs_w>, VEX, VEX_L; defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", int_x86_avx2_pabs_d>, VEX, VEX_L; + + def : Pat<(xor + (bc_v4i64 (v32i1sextv32i8)), + (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), + (VPABSBrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v16i1sextv16i16)), + (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))), + (VPABSWrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v8i1sextv8i32)), + (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), + (VPABSDrr256 VR256:$src)>; } defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", @@ -5062,10 +5160,26 @@ defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128>; +let Predicates = [HasSSSE3] in { + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (PABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (PABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 
(v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (PABSDrr128 VR128:$src)>; +} + //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions //===---------------------------------------------------------------------===// +let Sched = WriteVecALU in { def SSE_PHADDSUBD : OpndItins< IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM >; @@ -5075,12 +5189,16 @@ def SSE_PHADDSUBSW : OpndItins< def SSE_PHADDSUBW : OpndItins< IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM >; +} +let Sched = WriteShuffle in def SSE_PSHUFB : OpndItins< IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM >; +let Sched = WriteVecALU in def SSE_PSIGN : OpndItins< IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM >; +let Sched = WriteVecIMul in def SSE_PMULHRSW : OpndItins< IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW >; @@ -5097,7 +5215,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, - OpSize; + OpSize, Sched<[itins.Sched]>; def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, @@ -5105,7 +5223,8 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize; + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. @@ -5119,7 +5238,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - OpSize; + OpSize, Sched<[itins.Sched]>; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, @@ -5127,7 +5246,8 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, @@ -5269,7 +5389,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNR>, OpSize; + [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>; let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), @@ -5277,7 +5397,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNR>, OpSize; + [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; } } @@ -5287,13 +5407,13 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { (ins VR256:$src1, VR256:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; + []>, OpSize, Sched<[WriteShuffle]>; let mayLoad = 1 in def R256rm : 
SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; + []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; } } @@ -5341,6 +5461,7 @@ def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), // SSSE3 - Thread synchronization //===---------------------------------------------------------------------===// +let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, @@ -5354,13 +5475,14 @@ let Uses = [ECX, EAX] in def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>, TB, Requires<[HasSSE3]>; +} // SchedRW -def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; -def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; -def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, Requires<[In32BitMode]>; -def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// @@ -6017,11 +6139,11 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { } let ExeDomain = SSEPackedSingle in { - let Predicates = [HasAVX] in { + let Predicates = [UseAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), - "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX; } defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; @@ -6773,7 +6895,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], - IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), @@ -6782,7 +6904,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), RC:$src3))], - IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; } let Predicates = [HasAVX] in { @@ -6894,17 +7016,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, int_x86_sse41_pblendvb>; // Aliases with the implicit xmm0 argument -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrr0 VR128:$dst, 
VR128:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; let Predicates = [UseSSE41] in { @@ -7144,62 +7266,62 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { let Constraints = "$src1 = $dst" in { def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_8 GR32:$src1, (load addr:$src2)))]>; def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i16mem:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", + "crc32{w}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_16 GR32:$src1, (load addr:$src2)))]>, OpSize; def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR16:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", + "crc32{w}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, OpSize; def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", + "crc32{l}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_32 GR32:$src1, (load addr:$src2)))]>; def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", + "crc32{l}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_8 GR64:$src1, (load addr:$src2)))]>, REX_W; def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, REX_W; def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", + "crc32{q}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_64 GR64:$src1, (load addr:$src2)))]>, REX_W; def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", + "crc32{q}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, REX_W; @@ -7464,62 +7586,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX] in { -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 
VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (bc_v4i32 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (bc_v16i8 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (bc_v8i16 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } 
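[Editor's aside, not part of the patch: the vinsert128_insert / vextract128_extract patterns above and below map 256-bit subvector insert/extract nodes onto VINSERTF128 / VEXTRACTF128, whose 1-bit immediate selects the 128-bit lane (0 = low half, 1 = high half) of the YMM register. A minimal C++ sketch of that lane behaviour follows; it assumes an AVX-capable compiler and <immintrin.h> (build with e.g. -mavx). The intrinsics used (_mm256_insertf128_ps, _mm256_extractf128_ps, _mm_set_ps, ...) are standard Intel intrinsic names chosen purely for illustration and are not referenced anywhere in this patch.]

#include <immintrin.h>
#include <cstdio>

int main() {
  // ymm holds {0,1,2,3, 4,5,6,7}; element 0 sits in the low 128-bit lane.
  __m256 ymm = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
  __m128 xmm = _mm_set_ps(13, 12, 11, 10);            // {10,11,12,13}

  // vinsertf128 with imm = 1: replace the high lane, keep the low lane.
  __m256 ins = _mm256_insertf128_ps(ymm, xmm, 1);

  // vextractf128 with imm = 0 / 1: read back the low and high lanes.
  __m128 lo = _mm256_extractf128_ps(ins, 0);
  __m128 hi = _mm256_extractf128_ps(ins, 1);

  float l[4], h[4];
  _mm_storeu_ps(l, lo);
  _mm_storeu_ps(h, hi);
  std::printf("lo: %g %g %g %g\n", l[0], l[1], l[2], l[3]);  // 0 1 2 3
  std::printf("hi: %g %g %g %g\n", h[0], h[1], h[2], h[3]);  // 10 11 12 13
  return 0;
}

[The same lane numbering is what INSERT_get_vinsert128_imm / EXTRACT_get_vextract128_imm recover from the subvector index, which is presumably why the AVX2 VINSERTI128/VEXTRACTI128 hunks later in this diff can reuse the renamed, width-neutral pattern fragments unchanged.]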
//===----------------------------------------------------------------------===// @@ -7539,59 +7661,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), // AVX1 patterns let Predicates = [HasAVX] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4f32 (VEXTRACTF128rr (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2f64 (VEXTRACTF128rr (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), +def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTF128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTF128rr (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTF128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTF128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 
(vextract128_extract:$ext (v32i8 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -8069,42 +8191,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX2] in { -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (bc_v4i32 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (bc_v16i8 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (bc_v8i16 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -8123,39 +8245,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), VEX, VEX_L; let Predicates = [HasAVX2] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTI128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTI128rr (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + 
(EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTI128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTI128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -8250,7 +8372,9 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, []>, VEX_4VOp3, VEX_L; } -let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in { +let mayLoad = 1, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td index 757dcd0..0191c01 100644 --- a/lib/Target/X86/X86InstrSVM.td +++ b/lib/Target/X86/X86InstrSVM.td @@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; // 0F 01 DE let Uses = [EAX] in -def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB; +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; // 0F 01 D8 let Uses = [EAX] in def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DA let Uses = [EAX] in def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + 
"vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DB let Uses = [EAX] in def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DF let Uses = [EAX, ECX] in def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>; + "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX, ECX] in def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>; + "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index 1185941..1937770 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -15,26 +15,26 @@ let Defs = [EFLAGS] in { -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize; def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>; def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; - + let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", @@ -43,7 +43,7 @@ def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>; -def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], @@ -62,24 +62,25 @@ def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t$dst", [], IIC_SR>; } // hasSideEffects = 0 } // isConvertibleToThreeAddress = 1 -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { // FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern // using CL? 
let Uses = [CL] in { def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -118,20 +119,21 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t$dst", [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], IIC_SR>; +} // SchedRW -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize; def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), - "shr{l}\t{%cl, $dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>; def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } @@ -163,22 +165,23 @@ def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), "shr{q}\t$dst", [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>; -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), - "shr{l}\t{%cl, $dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -216,23 +219,24 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t$dst", [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], IIC_SR>; +} // SchedRW -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (sra GR8:$src1, CL))], IIC_SR>; def 
SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (sra GR16:$src1, CL))], IIC_SR>, OpSize; def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (sra GR32:$src1, CL))], IIC_SR>; def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (sra GR64:$src1, CL))], IIC_SR>; } @@ -273,24 +277,25 @@ def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), "sar{q}\t$dst", [(set GR64:$dst, (sra GR64:$src1, (i8 1)))], IIC_SR>; -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -330,20 +335,21 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t$dst", [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], IIC_SR>; +} // SchedRW //===----------------------------------------------------------------------===// // Rotate instructions //===----------------------------------------------------------------------===// let hasSideEffects = 0 in { -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize; @@ -351,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>; @@ -359,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -368,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcl{q}\t{$cnt, 
$dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -377,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize; @@ -385,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>; @@ -393,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; @@ -401,10 +407,11 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } // Constraints = "$src = $dst" +let SchedRW = [WriteShiftLd, WriteRMW] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), @@ -441,39 +448,40 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), let Uses = [CL] in { def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } +} // SchedRW } // hasSideEffects = 0 -let Constraints = "$src1 = $dst" in { 
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // FIXME: provide shorter instructions when imm8 == 1 let Uses = [CL] in { def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize; def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>; def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), - "rol{q}\t{%cl, $dst|$dst, CL}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } @@ -512,23 +520,24 @@ def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "rol{q}\t$dst", [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))], IIC_SR>; -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), - "rol{q}\t{%cl, $dst|$dst, %cl}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -568,20 +577,21 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), "rol{q}\t$dst", [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], IIC_SR>; +} // SchedRW -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "ror{w}\t{%cl, $dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize; def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>; def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } @@ -620,23 +630,24 @@ def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t$dst", [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], IIC_SR>; -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), - "ror{w}\t{%cl, 
$dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -676,46 +687,47 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t$dst", [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], IIC_SR>; +} // SchedRW //===----------------------------------------------------------------------===// // Double shift instructions (generalizations of rotate) //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; @@ -765,33 +777,34 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; } -} // Constraints = "$src = $dst" +} // Constraints = "$src = $dst", SchedRW +let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, 
$src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; } @@ -840,6 +853,7 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; +} // SchedRW } // Defs = [EFLAGS] @@ -857,12 +871,12 @@ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let neverHasSideEffects = 1 in { def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, TAXD, VEX; + []>, TAXD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, i8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, TAXD, VEX; + []>, TAXD, VEX, Sched<[WriteShiftLd]>; } } @@ -870,11 +884,17 @@ multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { let neverHasSideEffects = 1 in { def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - VEX_4VOp3; + VEX_4VOp3, Sched<[WriteShift]>; let mayLoad = 1 in def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - VEX_4VOp3; + VEX_4VOp3, + Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src1 + ReadAfterLd]>; } } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 3caa1b5..2196dc3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +let SchedRW = [WriteSystem] in { let Defs = [RAX, RDX] in def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>, TB; @@ -35,6 +36,7 @@ let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))], IIC_INT3>; +} // SchedRW def : Pat<(debugtrap), (INT3)>; @@ -43,6 +45,7 @@ def : Pat<(debugtrap), // FIXME: This doesn't work because InstAlias can't match immediate constants. 
//def : InstAlias<"int\t$3", (INT3)>; +let SchedRW = [WriteSystem] in { def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)], IIC_INT>; @@ -65,58 +68,62 @@ def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize; def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>; def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, Requires<[In64BitMode]>; +} // SchedRW //===----------------------------------------------------------------------===// // Input/Output Instructions. // +let SchedRW = [WriteSystem] in { let Defs = [AL], Uses = [DX] in def IN8rr : I<0xEC, RawFrm, (outs), (ins), - "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; + "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>; let Defs = [AX], Uses = [DX] in def IN16rr : I<0xED, RawFrm, (outs), (ins), - "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; + "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize; let Defs = [EAX], Uses = [DX] in def IN32rr : I<0xED, RawFrm, (outs), (ins), - "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; + "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>; let Defs = [AL] in def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), - "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; + "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize; + "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize; let Defs = [EAX] in def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; + "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>; let Uses = [DX, AL] in def OUT8rr : I<0xEE, RawFrm, (outs), (ins), - "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; + "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>; let Uses = [DX, AX] in def OUT16rr : I<0xEF, RawFrm, (outs), (ins), - "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; + "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize; let Uses = [DX, EAX] in def OUT32rr : I<0xEF, RawFrm, (outs), (ins), - "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; + "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>; let Uses = [AL] in def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), - "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; + "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; + "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize; let Uses = [EAX] in def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; + "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>; def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>; +} // SchedRW //===----------------------------------------------------------------------===// // Moves to and from debug registers +let SchedRW = [WriteSystem] in { def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), @@ -126,10 +133,12 @@ def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; def MOV64dr : I<0x23, MRMSrcReg, (outs 
DEBUG_REG:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // Moves to and from control registers +let SchedRW = [WriteSystem] in { def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), @@ -139,6 +148,7 @@ def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // Segment override instruction prefixes @@ -155,6 +165,7 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; // Moves to and from segment registers. // +let SchedRW = [WriteMove] in { def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize; def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), @@ -182,10 +193,12 @@ def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +} // SchedRW //===----------------------------------------------------------------------===// // Segmentation support instructions. +let SchedRW = [WriteSystem] in { def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), @@ -235,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), - "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), - "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; + "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), - "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), - "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), - "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), - "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHES16 : I<0x06, RawFrm, (outs), (ins), - "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHES32 : I<0x06, RawFrm, (outs), (ins), - "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), - "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; + 
"push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), - "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), - "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; + "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), - "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), - "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB; def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), - "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB; // No "pop cs" instruction. def POPSS16 : I<0x17, RawFrm, (outs), (ins), - "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>, OpSize, Requires<[In32BitMode]>; def POPSS32 : I<0x17, RawFrm, (outs), (ins), - "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>, Requires<[In32BitMode]>; def POPDS16 : I<0x1F, RawFrm, (outs), (ins), - "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{w}\t{%ds|ds}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPDS32 : I<0x1F, RawFrm, (outs), (ins), - "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{l}\t{%ds|ds}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPES16 : I<0x07, RawFrm, (outs), (ins), - "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + "pop{w}\t{%es|es}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPES32 : I<0x07, RawFrm, (outs), (ins), - "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + "pop{l}\t{%es|es}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPFS16 : I<0xa1, RawFrm, (outs), (ins), - "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB; def POPFS32 : I<0xa1, RawFrm, (outs), (ins), - "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPFS64 : I<0xa1, RawFrm, (outs), (ins), - "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB; def POPGS16 : I<0xa9, RawFrm, (outs), (ins), - "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB; def POPGS32 : I<0xa9, RawFrm, (outs), (ins), - "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPGS64 : I<0xa9, RawFrm, (outs), (ins), - "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB; def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), @@ -347,10 +360,12 @@ def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", [], IIC_VERW_MEM>, TB; def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", [], IIC_VERW_REG>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // Descriptor-table support instructions +let SchedRW = [WriteSystem] in { def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), @@ -385,9 +400,11 @@ def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), "lldt{w}\t$src", [], 
IIC_LLDT_MEM>, TB; - +} // SchedRW + //===----------------------------------------------------------------------===// // Specialized register support +let SchedRW = [WriteSystem] in { def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; @@ -410,14 +427,18 @@ def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // Cache instructions +let SchedRW = [WriteSystem] in { def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // XSAVE instructions +let SchedRW = [WriteSystem] in { let Defs = [RDX, RAX], Uses = [RCX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; @@ -428,16 +449,17 @@ let Uses = [RDX, RAX] in { def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), "xsave\t$dst", []>, TB; def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + "xsave{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), "xrstor\t$dst", []>, TB; def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + "xrstor{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), "xsaveopt\t$dst", []>, TB; def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + "xsaveopt{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; } +} // SchedRW //===----------------------------------------------------------------------===// // VIA PadLock crypto instructions @@ -493,8 +515,15 @@ let Predicates = [HasFSGSBase, In64BitMode] in { //===----------------------------------------------------------------------===// // INVPCID Instruction def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In32BitMode]>; def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SMAP Instruction +let Defs = [EFLAGS], Uses = [EFLAGS] in { + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; +} diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index a37a8cc..59a6f1e 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -15,6 +15,9 @@ //===----------------------------------------------------------------------===// // TSX instructions +def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + let usesCustomInserter = 1 in def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), "# XBEGIN", [(set GR32:$dst, 
(int_x86_xbegin))]>, @@ -27,6 +30,17 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst), def XEND : I<0x01, MRM_D5, (outs), (ins), "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; +let Defs = [EFLAGS] in +def XTEST : I<0x01, MRM_D6, (outs), (ins), + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>; + def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + +// HLE prefixes + +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; + +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; + diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 44d8cce..fc86e1e 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -339,6 +339,7 @@ extern "C" { /// must locate the start of the stub or call site and pass it into the JIT /// compiler function. extern "C" { +LLVM_ATTRIBUTE_USED // Referenced from inline asm. LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { intptr_t *RetAddrLoc = &StackPtr[1]; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index a8a9fd8..c7c00b5 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -225,32 +225,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, } - -static void lower_subreg32(MCInst *MI, unsigned OpNo) { - // Convert registers in the addr mode according to subreg32. - unsigned Reg = MI->getOperand(OpNo).getReg(); - if (Reg != 0) - MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32)); -} - -static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) { - // Convert registers in the addr mode according to subreg64. - for (unsigned i = 0; i != 4; ++i) { - if (!MI->getOperand(OpNo+i).isReg()) continue; - - unsigned Reg = MI->getOperand(OpNo+i).getReg(); - // LEAs can use RIP-relative addressing, and RIP has no sub/super register. - if (Reg == 0 || Reg == X86::RIP) continue; - - MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64)); - } -} - -/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8. -static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) { - OutMI.setOpcode(NewOpc); - lower_subreg32(&OutMI, 0); -} /// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) { OutMI.setOpcode(NewOpc); @@ -280,6 +254,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { Inst.addOperand(Saved); } +/// \brief If a movsx instruction has a shorter encoding for the used register +/// simplify the instruction to use it instead. +static void SimplifyMOVSX(MCInst &Inst) { + unsigned NewOpcode = 0; + unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg(); + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw + if (Op0 == X86::AX && Op1 == X86::AL) + NewOpcode = X86::CBW; + break; + case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl + if (Op0 == X86::EAX && Op1 == X86::AX) + NewOpcode = X86::CWDE; + break; + case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq + if (Op0 == X86::RAX && Op1 == X86::EAX) + NewOpcode = X86::CDQE; + break; + } + + if (NewOpcode != 0) { + Inst = MCInst(); + Inst.setOpcode(NewOpcode); + } +} + /// \brief Simplify things like MOV32rm to MOV32o32a. 
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, unsigned Opcode) { @@ -376,9 +378,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { // Handle a few special cases to eliminate operand modifiers. ReSimplify: switch (OutMI.getOpcode()) { - case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand. - lower_lea64_32mem(&OutMI, 1); - // FALL THROUGH. + case X86::LEA64_32r: case X86::LEA64r: case X86::LEA16r: case X86::LEA32r: @@ -388,23 +388,10 @@ ReSimplify: assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && "LEA has segment specified!"); break; - case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; - case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; - case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; - case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; - case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; - case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break; - case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break; - case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::MOV16r0: - LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 - LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr - break; - case X86::MOV64r0: - LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0 - LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + case X86::MOV32ri64: + OutMI.setOpcode(X86::MOV32ri); break; // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B @@ -598,6 +585,13 @@ ReSimplify: case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + // Try to shrink some forms of movsx. + case X86::MOVSX16rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr32: + SimplifyMOVSX(OutMI); + break; + case X86::MORESTACK_RET: OutMI.setOpcode(X86::RET); break; @@ -695,13 +689,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(Mang, *MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - std::string TmpStr; - raw_string_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; + llvm_unreachable("Should be handled target independently"); // Emit nothing here but a comment if we can. case X86::Int_MemBarrier: diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 16886e4..0923310 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -54,15 +54,14 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, - const TargetInstrInfo &tii) +X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), (tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::RIP : X86::EIP)), - TM(tm), TII(tii) { + TM(tm) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. 
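A note on the SimplifyMOVSX change in X86MCInstLower.cpp above: when a sign extension writes the full accumulator from its own low half (movsbw %al, %ax; movswl %ax, %eax; movslq %eax, %rax), the patch rewrites the MCInst to the shorter cbtw/cwtl/cltq forms. The following is a minimal standalone C++ sketch of that opcode mapping only; the enums and the shrinkMovsx helper are illustrative stand-ins for this note, not LLVM's MCInst API.

    #include <cstdio>

    // Illustrative stand-ins for the opcodes and registers involved.
    enum Opcode { MOVSX16rr8, MOVSX32rr16, MOVSX64rr32, CBW, CWDE, CDQE, OTHER };
    enum Reg { AL, AX, EAX, RAX, NONE };

    // Return the shorter one-byte sign-extension opcode when the operands are
    // the accumulator forms; otherwise return the original opcode unchanged.
    Opcode shrinkMovsx(Opcode Op, Reg Dst, Reg Src) {
      switch (Op) {
      case MOVSX16rr8:  return (Dst == AX  && Src == AL)  ? CBW  : Op; // movsbw %al, %ax  --> cbtw
      case MOVSX32rr16: return (Dst == EAX && Src == AX)  ? CWDE : Op; // movswl %ax, %eax --> cwtl
      case MOVSX64rr32: return (Dst == RAX && Src == EAX) ? CDQE : Op; // movslq %eax, %rax --> cltq
      default:          return Op;
      }
    }

    int main() {
      // Only the accumulator pattern is shrunk; other operand pairs pass through.
      std::printf("%d\n", shrinkMovsx(MOVSX32rr16, EAX, AX) == CWDE); // prints 1
      return 0;
    }

The real lowering code keeps the same structure: it inspects the two register operands of the MCInst and, when the pattern matches, replaces the instruction with the operand-free conversion opcode.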
@@ -242,6 +241,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { case CallingConv::Intel_OCL_BI: { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) @@ -276,8 +280,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); if (CC == CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX512) + return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; + if (Is64Bit && HasAVX512) + return CSR_64_Intel_OCL_BI_AVX512_RegMask; if (IsWin64 && HasAVX) return CSR_Win64_Intel_OCL_BI_AVX_RegMask; if (Is64Bit && HasAVX) @@ -306,19 +315,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); // Set the stack-pointer register and its aliases as reserved. - Reserved.set(X86::RSP); - for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); // Set the instruction pointer register and its aliases as reserved. - Reserved.set(X86::RIP); - for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { - Reserved.set(X86::RBP); - for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); } @@ -331,8 +340,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - Reserved.set(getBaseRegister()); - for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I) + for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true); + I.isValid(); ++I) Reserved.set(*I); } @@ -345,14 +354,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::GS); // Mark the floating point stack registers as reserved. - Reserved.set(X86::ST0); - Reserved.set(X86::ST1); - Reserved.set(X86::ST2); - Reserved.set(X86::ST3); - Reserved.set(X86::ST4); - Reserved.set(X86::ST5); - Reserved.set(X86::ST6); - Reserved.set(X86::ST7); + for (unsigned n = 0; n != 8; ++n) + Reserved.set(X86::ST0 + n); // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { @@ -365,19 +368,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... - static const uint16_t GPR64[] = { - X86::R8, X86::R9, X86::R10, X86::R11, - X86::R12, X86::R13, X86::R14, X86::R15 - }; - for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); // XMM8, XMM9, ... 
- assert(X86::XMM15 == X86::XMM8+7); for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } + if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) { + for (unsigned n = 16; n != 32; ++n) { + for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } return Reserved; } @@ -407,10 +411,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + return false; + const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - if (!MF.getTarget().Options.RealignStack) - return false; // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. @@ -688,4 +693,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, } } } + +unsigned get512BitSuperRegister(unsigned Reg) { + if (Reg >= X86::XMM0 && Reg <= X86::XMM31) + return X86::ZMM0 + (Reg - X86::XMM0); + if (Reg >= X86::YMM0 && Reg <= X86::YMM31) + return X86::ZMM0 + (Reg - X86::YMM0); + if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) + return Reg; + llvm_unreachable("Unexpected SIMD register"); +} + } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index b9d7b8c..fb17682 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -27,7 +27,6 @@ namespace llvm { class X86RegisterInfo : public X86GenRegisterInfo { public: X86TargetMachine &TM; - const TargetInstrInfo &TII; private: /// Is64Bit - Is the target 64-bits. @@ -56,7 +55,7 @@ private: unsigned BasePtr; public: - X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); + X86RegisterInfo(X86TargetMachine &tm); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -138,6 +137,9 @@ public: // e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); +//get512BitRegister - X86 utility - returns 512-bit super register +unsigned get512BitSuperRegister(unsigned Reg); + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index be6282a..b802728 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -21,11 +21,12 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> // Subregister indices. 
let Namespace = "X86" in { - def sub_8bit : SubRegIndex; - def sub_8bit_hi : SubRegIndex; - def sub_16bit : SubRegIndex; - def sub_32bit : SubRegIndex; - def sub_xmm : SubRegIndex; + def sub_8bit : SubRegIndex<8>; + def sub_8bit_hi : SubRegIndex<8, 8>; + def sub_16bit : SubRegIndex<16>; + def sub_32bit : SubRegIndex<32>; + def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; } //===----------------------------------------------------------------------===// @@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>; + } // CostPerUse -// YMM Registers, used by AVX instructions +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. let SubRegIndices = [sub_xmm] in { -def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>; -def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>; -def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>; -def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>; -def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>; -def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>; -def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>; -def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>; -def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>; -def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>; -def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>; -def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>; -def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>; -def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>; -def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>; -def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>; + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } } + // Mask Registers, used by AVX-512 instructions. 
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; + def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; + def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; + def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; + def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; + def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; + def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; + def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; + class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { let Aliases = A; } @@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512, + (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. +def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + 256, (sequence "YMM%u", 0, 31)>; + +def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>; +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>; + +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>; +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; + diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td new file mode 100644 index 0000000..62ba2bc --- /dev/null +++ b/lib/Target/X86/X86SchedHaswell.td @@ -0,0 +1,132 @@ +//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Haswell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def HaswellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; +} + +let SchedModel = HaswellModel in { + +// Haswell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, 6 and 7 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def HWPort0 : ProcResource<1>; +def HWPort1 : ProcResource<1>; +def HWPort2 : ProcResource<1>; +def HWPort3 : ProcResource<1>; +def HWPort4 : ProcResource<1>; +def HWPort5 : ProcResource<1>; +def HWPort6 : ProcResource<1>; +def HWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. 
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; +def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; +def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; +def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; + +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + +// Integer division issued on port 0. +def HWDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [HWPort4]>; + +def : WriteRes<WriteStore, [HWPort237, HWPort4]>; +def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [HWPort0156]>; +def : WriteRes<WriteZero, []>; + +defm : HWWriteResPair<WriteALU, HWPort0156, 1>; +defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : HWWriteResPair<WriteShift, HWPort056, 1>; +defm : HWWriteResPair<WriteJump, HWPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [HWPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; +defm : HWWriteResPair<WriteFMul, HWPort0, 5>; +defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. +defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; +defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; +defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; +defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; + +// Vector integer operations. 
+defm : HWWriteResPair<WriteVecShift, HWPort05, 1>; +defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>; +defm : HWWriteResPair<WriteVecALU, HWPort15, 1>; +defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>; +defm : HWWriteResPair<WriteShuffle, HWPort15, 1>; + +def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } +} // SchedModel diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td new file mode 100644 index 0000000..52ead94 --- /dev/null +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -0,0 +1,127 @@ +//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Sandy Bridge to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SandyBridgeModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SB can decode 4 + // instructions per cycle. + // FIXME: Identify instructions that aren't a single fused micro-op. + let IssueWidth = 4; + let MicroOpBufferSize = 168; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; +} + +let SchedModel = SandyBridgeModel in { + +// Sandy Bridge can issue micro-ops to 6 different ports in one cycle. + +// Ports 0, 1, and 5 handle all computation. +def SBPort0 : ProcResource<1>; +def SBPort1 : ProcResource<1>; +def SBPort5 : ProcResource<1>; + +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. +def SBPort23 : ProcResource<2>; + +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +def SBPort4 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; +def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; +def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; + +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + +// Integer division issued on port 0. +def SBDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. 
+ def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SBPort4]>; + +def : WriteRes<WriteStore, [SBPort23, SBPort4]>; +def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [SBPort015]>; +def : WriteRes<WriteZero, []>; + +defm : SBWriteResPair<WriteALU, SBPort015, 1>; +defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : SBWriteResPair<WriteShift, SBPort05, 1>; +defm : SBWriteResPair<WriteJump, SBPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SBPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; +defm : SBWriteResPair<WriteFMul, SBPort0, 5>; +defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; +defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; +defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; + +// Vector integer operations. +defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; +defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; + +def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } +} // SchedModel diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index da0ca7d..ceb2e05 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -42,6 +42,7 @@ multiclass X86SchedWritePair { // Arithmetic. defm WriteALU : X86SchedWritePair; // Simple integer ALU op. defm WriteIMul : X86SchedWritePair; // Integer multiplication. +def WriteIMulH : SchedWrite; // Integer multiplication, high part. defm WriteIDiv : X86SchedWritePair; // Integer division. def WriteLEA : SchedWrite; // LEA instructions can't fold loads. @@ -53,6 +54,10 @@ def WriteLoad : SchedWrite; def WriteStore : SchedWrite; def WriteMove : SchedWrite; +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def WriteZero : SchedWrite; + // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. defm WriteJump : X86SchedWritePair; @@ -63,6 +68,10 @@ defm WriteFMul : X86SchedWritePair; // Floating point multiplication. defm WriteFDiv : X86SchedWritePair; // Floating point division. defm WriteFSqrt : X86SchedWritePair; // Floating point square root. defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal. +defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. + +// FMA Scheduling helper class. +class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } // Vector integer operations. 
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. @@ -79,9 +88,14 @@ defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer. defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float. defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion. +// Catch-all for expensive system instructions. +def WriteSystem : SchedWrite; + +// Old microcoded instructions that nobody use. +def WriteMicrocoded : SchedWrite; + //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for X86 -def IIC_DEFAULT : InstrItinClass; def IIC_ALU_MEM : InstrItinClass; def IIC_ALU_NONMEM : InstrItinClass; def IIC_LEA : InstrItinClass; @@ -253,10 +267,14 @@ def IIC_SSE_PINSRW : InstrItinClass; def IIC_SSE_PABS_RR : InstrItinClass; def IIC_SSE_PABS_RM : InstrItinClass; -def IIC_SSE_SQRTP_RR : InstrItinClass; -def IIC_SSE_SQRTP_RM : InstrItinClass; -def IIC_SSE_SQRTS_RR : InstrItinClass; -def IIC_SSE_SQRTS_RM : InstrItinClass; +def IIC_SSE_SQRTPS_RR : InstrItinClass; +def IIC_SSE_SQRTPS_RM : InstrItinClass; +def IIC_SSE_SQRTSS_RR : InstrItinClass; +def IIC_SSE_SQRTSS_RM : InstrItinClass; +def IIC_SSE_SQRTPD_RR : InstrItinClass; +def IIC_SSE_SQRTPD_RM : InstrItinClass; +def IIC_SSE_SQRTSD_RR : InstrItinClass; +def IIC_SSE_SQRTSD_RM : InstrItinClass; def IIC_SSE_RCPP_RR : InstrItinClass; def IIC_SSE_RCPP_RM : InstrItinClass; @@ -533,8 +551,9 @@ def IIC_NOP : InstrItinClass; // Resources beyond the decoder operate on micro-ops and are bufferred // so adjacent micro-ops don't directly compete. // -// MinLatency=0 indicates that RAW dependencies can be decoded in the -// same cycle. +// MicroOpBufferSize > 1 indicates that RAW dependencies can be +// decoded in the same cycle. The value 32 is a reasonably arbitrary +// number of in-flight instructions. // // HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef // indicates high latency opcodes. Alternatively, InstrItinData @@ -542,17 +561,14 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // -// ILPWindow=10 is an arbitrary threshold that approximates cycles of -// latency hidden by instruction buffers. The actual value is not very -// important but should be zero for inorder and nonzero for OOO processors. -// // The GenericModel contains no instruciton itineraries. 
def GenericModel : SchedMachineModel { let IssueWidth = 4; - let MinLatency = 0; + let MicroOpBufferSize = 32; let LoadLatency = 4; let HighLatency = 10; - let ILPWindow = 10; } include "X86ScheduleAtom.td" +include "X86SchedSandyBridge.td" +include "X86SchedHaswell.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 1e5f2d6..14a1471 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -33,7 +33,6 @@ def AtomItineraries : ProcessorItineraries< // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >, // // Default is 1 cycle, port0 or port1 - InstrItinData<IIC_DEFAULT, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >, @@ -212,10 +211,15 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, - InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, @@ -521,11 +525,9 @@ def AtomItineraries : ProcessorItineraries< // Atom machine model. def AtomModel : SchedMachineModel { let IssueWidth = 2; // Allows 2 instructions per scheduling group. - let MinLatency = 1; // InstrStage cycles overrides MinLatency. - // OperandCycles may be used for expected latency. + let MicroOpBufferSize = 0; // In-order execution, always hide latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let ILPWindow = 0; // Always try to hide expected latency. 
let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index f934fdd..d1db79f 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -27,7 +27,7 @@ X86SelectionDAGInfo::~X86SelectionDAGInfo() { } SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, +X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -175,7 +175,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, } SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, +X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index d1d66fe..d728af5 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -34,7 +34,7 @@ public: ~X86SelectionDAGInfo(); virtual - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -42,7 +42,7 @@ public: MachinePointerInfo DstPtrInfo) const; virtual - SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 0f2c008..fae90f2 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -37,8 +37,7 @@ using namespace llvm; /// ClassifyBlockAddressReference - Classify a blockaddress reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. -unsigned char X86Subtarget:: -ClassifyBlockAddressReference() const { +unsigned char X86Subtarget::ClassifyBlockAddressReference() const { if (isPICStyleGOT()) // 32-bit ELF targets. return X86II::MO_GOTOFF; @@ -171,6 +170,26 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } +static bool OSHasAVXSupport() { +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ + || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) +#if defined(__GNUC__) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. 
+ int rEAX, rEDX; + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +#else + int rEAX = 0; // Ensures we return false +#endif + return (rEAX & 6) == 6; +#else + return false; +#endif +} + void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; unsigned MaxLevel; @@ -193,7 +212,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} - if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } + if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) { + X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); + } bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; @@ -283,6 +304,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasLZCNT = true; ToggleFeature(X86::FeatureLZCNT); } + if (IsIntel && ((ECX >> 8) & 0x1)) { + HasPRFCHW = true; + ToggleFeature(X86::FeaturePRFCHW); + } if (IsAMD) { if ((ECX >> 6) & 0x1) { HasSSE4A = true; @@ -310,6 +335,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasBMI = true; ToggleFeature(X86::FeatureBMI); } + if ((EBX >> 4) & 0x1) { + HasHLE = true; + ToggleFeature(X86::FeatureHLE); + } if (IsIntel && ((EBX >> 5) & 0x1)) { X86SSELevel = AVX2; ToggleFeature(X86::FeatureAVX2); @@ -322,6 +351,14 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasRTM = true; ToggleFeature(X86::FeatureRTM); } + if (IsIntel && ((EBX >> 19) & 0x1)) { + HasADX = true; + ToggleFeature(X86::FeatureADX); + } + if (IsIntel && ((EBX >> 18) & 0x1)) { + HasRDSEED = true; + ToggleFeature(X86::FeatureRDSEED); + } } } } @@ -439,7 +476,13 @@ void X86Subtarget::initializeEnvironment() { HasBMI = false; HasBMI2 = false; HasRTM = false; + HasHLE = false; + HasERI = false; + HasCDI = false; + HasPFI=false; HasADX = false; + HasPRFCHW = false; + HasRDSEED = false; IsBTMemSlow = false; IsUAMemFast = false; HasVectorUAMem = false; @@ -448,6 +491,8 @@ void X86Subtarget::initializeEnvironment() { HasSlowDivide = false; PostRAScheduler = false; PadShortFunctions = false; + CallRegIndirect = false; + LEAUsesAG = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index e97da4b..8793238 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -42,7 +42,7 @@ enum Style { class X86Subtarget : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512 }; enum X863DNowEnum { @@ -121,9 +121,18 @@ protected: /// HasRTM - Processor has RTM instructions. bool HasRTM; + /// HasHLE - Processor has HLE. + bool HasHLE; + /// HasADX - Processor has ADX instructions. bool HasADX; + /// HasPRFCHW - Processor has PRFCHW instructions. + bool HasPRFCHW; + + /// HasRDSEED - Processor has RDSEED instructions. + bool HasRDSEED; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. 
bool IsBTMemSlow; @@ -153,6 +162,22 @@ protected: /// a stall when returning too early. bool PadShortFunctions; + /// CallRegIndirect - True if the Calls with memory reference should be converted + /// to a register-based indirect call. + bool CallRegIndirect; + /// LEAUsesAG - True if the LEA instruction inputs have to be ready at + /// address generation (AG) time. + bool LEAUsesAG; + + /// Processor has AVX-512 PreFetch Instructions + bool HasPFI; + + /// Processor has AVX-512 Exponential and Reciprocal Instructions + bool HasERI; + + /// Processor has AVX-512 Conflict Detection Instructions + bool HasCDI; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -233,6 +258,7 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasAVX512() const { return X86SSELevel >= AVX512; } bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } @@ -253,7 +279,10 @@ public: bool hasBMI() const { return HasBMI; } bool hasBMI2() const { return HasBMI2; } bool hasRTM() const { return HasRTM; } + bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } + bool hasPRFCHW() const { return HasPRFCHW; } + bool hasRDSEED() const { return HasRDSEED; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } @@ -261,6 +290,11 @@ public: bool useLeaForSP() const { return UseLeaForSP; } bool hasSlowDivide() const { return HasSlowDivide; } bool padShortFunctions() const { return PadShortFunctions; } + bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } + bool hasCDI() const { return HasCDI; } + bool hasPFI() const { return HasPFI; } + bool hasERI() const { return HasERI; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -317,7 +351,13 @@ public: } bool isPICStyleStubAny() const { return PICStyle == PICStyles::StubDynamicNoPIC || - PICStyle == PICStyles::StubPIC; } + PICStyle == PICStyles::StubPIC; + } + + bool isCallingConvWin64(CallingConv::ID CC) const { + return (isTargetWin64() && CC != CallingConv::X86_64_SysV) || + CC == CallingConv::X86_64_Win64; + } /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8aa58a2..49ebd1a 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -49,6 +49,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), JITInfo(*this) { + initAsmInfo(); } void X86_64TargetMachine::anchor() { } @@ -69,6 +70,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), JITInfo(*this) { + initAsmInfo(); } /// X86TargetMachine ctor - Create an X86 target. @@ -130,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { // Add first the target-independent BasicTTI pass, then our X86 pass. This // allows the X86 pass to delegate to the target independent layer when // appropriate. 
- PM.add(createBasicTargetTransformInfoPass(getTargetLowering())); + PM.add(createBasicTargetTransformInfoPass(this)); PM.add(createX86TargetTransformInfoPass(this)); } @@ -215,6 +217,11 @@ bool X86PassConfig::addPreEmitPass() { addPass(createX86PadShortFunctions()); ShouldPrint = true; } + if (getOptLevel() != CodeGenOpt::None && + getX86Subtarget().LEAusesAG()){ + addPass(createX86FixupLEAs()); + ShouldPrint = true; + } return ShouldPrint; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 871dacd..a19c5a6 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +const MCExpr * +X86LinuxTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 9d26d38..79c861d 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -36,6 +36,9 @@ namespace llvm { /// and x86-64. class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + /// \brief Describe a TLS variable address within debug info. + virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const; }; } // end namespace llvm diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index be2a997..3bbddad 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &); namespace { class X86TTI : public ImmutablePass, public TargetTransformInfo { - const X86TargetMachine *TM; const X86Subtarget *ST; const X86TargetLowering *TLI; @@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + X86TTI() : ImmutablePass(ID), ST(0), TLI(0) { llvm_unreachable("This pass cannot be directly constructed"); } X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializeX86TTIPass(*PassRegistry::getPassRegistry()); } @@ -86,7 +85,9 @@ public: virtual unsigned getNumberOfRegisters(bool Vector) const; virtual unsigned getRegisterBitWidth(bool Vector) const; virtual unsigned getMaximumUnrollFactor() const; - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind, + OperandValueKind) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -99,6 +100,8 @@ public: unsigned Alignment, unsigned AddressSpace) const; + virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const; + /// @} }; @@ -162,13 +165,134 @@ unsigned X86TTI::getMaximumUnrollFactor() const { return 2; } -unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { +unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type 
*Ty, + OperandValueKind Op1Info, + OperandValueKind Op2Info) const { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry<MVT> AVX2CostTable[] = { + // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to + // customize them to detect the cases where shift amount is a scalar one. + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 1 }, + { ISD::SRA, MVT::v4i32, 1 }, + { ISD::SHL, MVT::v8i32, 1 }, + { ISD::SRL, MVT::v8i32, 1 }, + { ISD::SRA, MVT::v8i32, 1 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 1 }, + { ISD::SHL, MVT::v4i64, 1 }, + { ISD::SRL, MVT::v4i64, 1 }, + + { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence. + { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized. + + { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized. + + { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized. + { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable), + ISD, LT.second); + if (Idx != -1) + return LT.first * AVX2CostTable[Idx].Cost; + } + + static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // Constant splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. + + { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasSSE2()) { + int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable, + array_lengthof(SSE2UniformConstCostTable), + ISD, LT.second); + if (Idx != -1) + return LT.first * SSE2UniformConstCostTable[Idx].Cost; + } + + + static const CostTblEntry<MVT> SSE2CostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // For some cases, where the shift amount is a scalar we would be able + // to generate better code. Unfortunately, when this is the case the value + // (the splat) will get hoisted out of the loop, thereby making it invisible + // to ISel. The cost model must return worst case assumptions because it is + // used for vectorization and we don't want to make vectorized code worse + // than scalar code. + { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence. + { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. + + { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized. + { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized. 
+ { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. + + { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized. + { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized. + { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spilling regular + // registers. The overhead of division is going to dominate most kernels + // anyways so try hard to prevent vectorization of division - it is + // generally a bad idea. Assume somewhat arbitrarily that we have to be able + // to hide "20 cycles" for each lane. + { ISD::SDIV, MVT::v16i8, 16*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v16i8, 16*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, + }; + + if (ST->hasSSE2()) { + int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable), + ISD, LT.second); + if (Idx != -1) + return LT.first * SSE2CostTable[Idx].Cost; + } + static const CostTblEntry<MVT> AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. @@ -213,7 +337,8 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return 6; // Fallback to the default implementation. - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty); + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, + Op2Info); } unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, @@ -235,9 +360,44 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst); + + static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. 
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + }; + + if (ST->hasSSE2() && !ST->hasAVX()) { + int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl, + array_lengthof(SSE2ConvTbl), + ISD, LTDest.second, LTSrc.second); + if (Idx != -1) + return LTSrc.first * SSE2ConvTbl[Idx].Cost; + } + EVT SrcTy = TLI->getValueType(Src); EVT DstTy = TLI->getValueType(Dst); + // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); @@ -248,17 +408,40 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, - { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, - { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, + + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, }; @@ -310,19 +493,22 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, }; if (ST->hasAVX2()) { - int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), + ISD, MTy); if (Idx != -1) return LT.first * AVX2CostTbl[Idx].Cost; } if (ST->hasAVX()) { - int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), + ISD, MTy); if (Idx 
!= -1) return LT.first * AVX1CostTbl[Idx].Cost; } if (ST->hasSSE42()) { - int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), + ISD, MTy); if (Idx != -1) return LT.first * SSE42CostTbl[Idx].Cost; } @@ -354,8 +540,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); } +unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) const { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { + // Handle non power of two vectors such as <3 x float> + if (VectorType *VTy = dyn_cast<VectorType>(Src)) { + unsigned NumElem = VTy->getVectorNumElements(); + + // Handle a few common cases: + // <3 x float> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) + // Cost = 64 bit store + extract + 32 bit store. + return 3; + + // <3 x double> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) + // Cost = 128 bit store + unpack + 64 bit store. + return 3; + + // Assume that all other non power-of-two numbers are scalarized. + if (!isPowerOf2_32(NumElem)) { + unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, + VTy->getScalarType(), + Alignment, + AddressSpace); + unsigned SplitCost = getScalarizationOverhead(Src, + Opcode == Instruction::Load, + Opcode==Instruction::Store); + return NumElem * Cost + SplitCost; + } + } + // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -371,3 +600,16 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, return Cost; } + +unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. 
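// Illustrative sketch (not part of the patch): the non-power-of-two costing in
// getMemoryOpCost above decomposes into "one scalar memory op per lane plus
// one insert/extract per lane". The free function below restates that formula
// outside the TTI interface; its name and the unit costs are assumptions.
static unsigned sketchNonPow2VectorMemCost(unsigned NumElem,
                                           unsigned ScalarMemOpCost,
                                           unsigned LaneInsExtCost) {
  // e.g. a <3 x i32> load with both unit costs at 1 comes out as 3*1 + 3*1 = 6,
  // while the hand-tuned <3 x float>/<3 x double> cases above return 3.
  return NumElem * ScalarMemOpCost + NumElem * LaneInsExtCost;
}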
+ unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex); +} diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 0f77948..477f75a 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { } static bool isYmmReg(unsigned Reg) { - if (Reg >= X86::YMM0 && Reg <= X86::YMM15) - return true; + return (Reg >= X86::YMM0 && Reg <= X86::YMM31); +} - return false; +static bool isZmmReg(unsigned Reg) { + return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); } static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first)) + if (isYmmReg(I->first) || isZmmReg(I->first)) return true; return false; } static bool clobbersAllYmmRegs(const MachineOperand &MO) { - for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { + for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } |
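The X86VZeroUpper changes above widen the YMM range check to YMM0-YMM31 and add a parallel ZMM0-ZMM31 check, relying on the target's physical-register enums being contiguous. The following is a minimal, self-contained sketch of that range-predicate pattern, assuming placeholder enum values that do not correspond to LLVM's generated register numbers.

// Self-contained sketch of the contiguous-range predicate pattern used in
// X86VZeroUpper.cpp above; FakeReg values are illustrative placeholders.
#include <cassert>

enum FakeReg : unsigned { YMM0 = 100, YMM31 = 131, ZMM0 = 132, ZMM31 = 163 };

static bool isYmm(unsigned Reg) { return Reg >= YMM0 && Reg <= YMM31; }
static bool isZmm(unsigned Reg) { return Reg >= ZMM0 && Reg <= ZMM31; }

// Mirrors checkFnHasLiveInYmm's "any YMM or ZMM live-in" test on a plain array.
static bool hasWideVectorLiveIn(const unsigned *Regs, unsigned N) {
  for (unsigned i = 0; i < N; ++i)
    if (isYmm(Regs[i]) || isZmm(Regs[i]))
      return true;
  return false;
}

int main() {
  unsigned LiveIns[] = {5, YMM0 + 3};
  assert(hasWideVectorLiveIn(LiveIns, 2));
  return 0;
}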
