aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--lib/Target/X86/Android.mk1
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp1245
-rw-r--r--lib/Target/X86/CMakeLists.txt3
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp114
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.c370
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h74
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h106
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp21
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h6
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp16
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h32
-rw-r--r--lib/Target/X86/MCTargetDesc/Android.mk2
-rw-r--r--lib/Target/X86/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h126
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp13
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp135
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp369
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp33
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp116
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp18
-rw-r--r--lib/Target/X86/README-SSE.txt31
-rw-r--r--lib/Target/X86/X86.h5
-rw-r--r--lib/Target/X86/X86.td127
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp44
-rw-r--r--lib/Target/X86/X86AsmPrinter.h5
-rw-r--r--lib/Target/X86/X86CallingConv.td51
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp19
-rw-r--r--lib/Target/X86/X86FastISel.cpp307
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp253
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp18
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp133
-rw-r--r--lib/Target/X86/X86FrameLowering.h31
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp198
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp2251
-rw-r--r--lib/Target/X86/X86ISelLowering.h108
-rw-r--r--lib/Target/X86/X86Instr3DNow.td13
-rw-r--r--lib/Target/X86/X86InstrAVX512.td715
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td181
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td9
-rw-r--r--lib/Target/X86/X86InstrCompiler.td146
-rw-r--r--lib/Target/X86/X86InstrControl.td72
-rw-r--r--lib/Target/X86/X86InstrExtension.td113
-rw-r--r--lib/Target/X86/X86InstrFPStack.td56
-rw-r--r--lib/Target/X86/X86InstrFormats.td331
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td63
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp620
-rw-r--r--lib/Target/X86/X86InstrInfo.h22
-rw-r--r--lib/Target/X86/X86InstrInfo.td658
-rw-r--r--lib/Target/X86/X86InstrMMX.td105
-rw-r--r--lib/Target/X86/X86InstrSSE.td730
-rw-r--r--lib/Target/X86/X86InstrSVM.td18
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td194
-rw-r--r--lib/Target/X86/X86InstrSystem.td117
-rw-r--r--lib/Target/X86/X86InstrTSX.td14
-rw-r--r--lib/Target/X86/X86JITInfo.cpp1
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp90
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp70
-rw-r--r--lib/Target/X86/X86RegisterInfo.h6
-rw-r--r--lib/Target/X86/X86RegisterInfo.td92
-rw-r--r--lib/Target/X86/X86SchedHaswell.td132
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td127
-rw-r--r--lib/Target/X86/X86Schedule.td42
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td18
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp4
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h4
-rw-r--r--lib/Target/X86/X86Subtarget.cpp51
-rw-r--r--lib/Target/X86/X86Subtarget.h44
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp9
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp6
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h3
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp272
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp15
73 files changed, 8198 insertions, 3355 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index f159bb2..a9c413d 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -15,6 +15,7 @@ x86_codegen_SRC_FILES := \
X86COFFMachineModuleInfo.cpp \
X86CodeEmitter.cpp \
X86FastISel.cpp \
+ X86FixupLEAs.cpp \
X86FloatingPoint.cpp \
X86FrameLowering.cpp \
X86ISelDAGToDAG.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 4ed5534a6..ad83d97 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,10 +9,12 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
@@ -32,17 +34,453 @@ using namespace llvm;
namespace {
struct X86Operand;
+static const char OpPrecedence[] = {
+ 0, // IC_PLUS
+ 0, // IC_MINUS
+ 1, // IC_MULTIPLY
+ 1, // IC_DIVIDE
+ 2, // IC_RPAREN
+ 3, // IC_LPAREN
+ 0, // IC_IMM
+ 0 // IC_REGISTER
+};
+
class X86AsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
ParseInstructionInfo *InstInfo;
private:
+ enum InfixCalculatorTok {
+ IC_PLUS = 0,
+ IC_MINUS,
+ IC_MULTIPLY,
+ IC_DIVIDE,
+ IC_RPAREN,
+ IC_LPAREN,
+ IC_IMM,
+ IC_REGISTER
+ };
+
+ class InfixCalculator {
+ typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+ SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+ SmallVector<ICToken, 4> PostfixStack;
+
+ public:
+ int64_t popOperand() {
+ assert (!PostfixStack.empty() && "Poped an empty stack!");
+ ICToken Op = PostfixStack.pop_back_val();
+ assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
+ && "Expected and immediate or register!");
+ return Op.second;
+ }
+ void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+ assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+ "Unexpected operand!");
+ PostfixStack.push_back(std::make_pair(Op, Val));
+ }
+
+ void popOperator() { InfixOperatorStack.pop_back_val(); }
+ void pushOperator(InfixCalculatorTok Op) {
+ // Push the new operator if the stack is empty.
+ if (InfixOperatorStack.empty()) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // Push the new operator if it has a higher precedence than the operator
+ // on the top of the stack or the operator on the top of the stack is a
+ // left parentheses.
+ unsigned Idx = InfixOperatorStack.size() - 1;
+ InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+ if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+ InfixOperatorStack.push_back(Op);
+ return;
+ }
+
+ // The operator on the top of the stack has higher precedence than the
+ // new operator.
+ unsigned ParenCount = 0;
+ while (1) {
+ // Nothing to process.
+ if (InfixOperatorStack.empty())
+ break;
+
+ Idx = InfixOperatorStack.size() - 1;
+ StackOp = InfixOperatorStack[Idx];
+ if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+ break;
+
+ // If we have an even parentheses count and we see a left parentheses,
+ // then stop processing.
+ if (!ParenCount && StackOp == IC_LPAREN)
+ break;
+
+ if (StackOp == IC_RPAREN) {
+ ++ParenCount;
+ InfixOperatorStack.pop_back_val();
+ } else if (StackOp == IC_LPAREN) {
+ --ParenCount;
+ InfixOperatorStack.pop_back_val();
+ } else {
+ InfixOperatorStack.pop_back_val();
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+ }
+ // Push the new operator.
+ InfixOperatorStack.push_back(Op);
+ }
+ int64_t execute() {
+ // Push any remaining operators onto the postfix stack.
+ while (!InfixOperatorStack.empty()) {
+ InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+ if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+ PostfixStack.push_back(std::make_pair(StackOp, 0));
+ }
+
+ if (PostfixStack.empty())
+ return 0;
+
+ SmallVector<ICToken, 16> OperandStack;
+ for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+ ICToken Op = PostfixStack[i];
+ if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+ OperandStack.push_back(Op);
+ } else {
+ assert (OperandStack.size() > 1 && "Too few operands.");
+ int64_t Val;
+ ICToken Op2 = OperandStack.pop_back_val();
+ ICToken Op1 = OperandStack.pop_back_val();
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_PLUS:
+ Val = Op1.second + Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MINUS:
+ Val = Op1.second - Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_MULTIPLY:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Multiply operation with an immediate and a register!");
+ Val = Op1.second * Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_DIVIDE:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Divide operation with an immediate and a register!");
+ assert (Op2.second != 0 && "Division by zero!");
+ Val = Op1.second / Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ }
+ }
+ }
+ assert (OperandStack.size() == 1 && "Expected a single result.");
+ return OperandStack.pop_back_val().second;
+ }
+ };
+
+ enum IntelExprState {
+ IES_PLUS,
+ IES_MINUS,
+ IES_MULTIPLY,
+ IES_DIVIDE,
+ IES_LBRAC,
+ IES_RBRAC,
+ IES_LPAREN,
+ IES_RPAREN,
+ IES_REGISTER,
+ IES_INTEGER,
+ IES_IDENTIFIER,
+ IES_ERROR
+ };
+
+ class IntelExprStateMachine {
+ IntelExprState State, PrevState;
+ unsigned BaseReg, IndexReg, TmpReg, Scale;
+ int64_t Imm;
+ const MCExpr *Sym;
+ StringRef SymName;
+ bool StopOnLBrac, AddImmPrefix;
+ InfixCalculator IC;
+ InlineAsmIdentifierInfo Info;
+ public:
+ IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+ State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+ Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac),
+ AddImmPrefix(addimmprefix) { Info.clear(); }
+
+ unsigned getBaseReg() { return BaseReg; }
+ unsigned getIndexReg() { return IndexReg; }
+ unsigned getScale() { return Scale; }
+ const MCExpr *getSym() { return Sym; }
+ StringRef getSymName() { return SymName; }
+ int64_t getImm() { return Imm + IC.execute(); }
+ bool isValidEndState() {
+ return State == IES_RBRAC || State == IES_INTEGER;
+ }
+ bool getStopOnLBrac() { return StopOnLBrac; }
+ bool getAddImmPrefix() { return AddImmPrefix; }
+ bool hadError() { return State == IES_ERROR; }
+
+ InlineAsmIdentifierInfo &getIdentifierInfo() {
+ return Info;
+ }
+
+ void onPlus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onMinus() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ case IES_RPAREN:
+ case IES_LBRAC:
+ case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_REGISTER:
+ State = IES_MINUS;
+ // Only push the minus operator if it is not a unary operator.
+ if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
+ CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
+ CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+ IC.pushOperator(IC_MINUS);
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRegister(unsigned Reg) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_LPAREN:
+ State = IES_REGISTER;
+ TmpReg = Reg;
+ IC.pushOperand(IC_REGISTER);
+ break;
+ case IES_MULTIPLY:
+ // Index Register - Scale * Register
+ if (PrevState == IES_INTEGER) {
+ assert (!IndexReg && "IndexReg already set!");
+ State = IES_REGISTER;
+ IndexReg = Reg;
+ // Get the scale and replace the 'Scale * Register' with '0'.
+ Scale = IC.popOperand();
+ IC.pushOperand(IC_IMM);
+ IC.popOperator();
+ } else {
+ State = IES_ERROR;
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ State = IES_INTEGER;
+ Sym = SymRef;
+ SymName = SymRefName;
+ IC.pushOperand(IC_IMM);
+ break;
+ }
+ }
+ void onInteger(int64_t TmpInt) {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_DIVIDE:
+ case IES_MULTIPLY:
+ case IES_LPAREN:
+ State = IES_INTEGER;
+ if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // Index Register - Register * Scale
+ assert (!IndexReg && "IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = TmpInt;
+ // Get the scale and replace the 'Register * Scale' with '0'.
+ IC.popOperator();
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ CurrState == IES_MINUS) {
+ // Unary minus. No need to pop the minus operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+ } else {
+ IC.pushOperand(IC_IMM, TmpInt);
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onStar() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_MULTIPLY;
+ IC.pushOperator(IC_MULTIPLY);
+ break;
+ }
+ }
+ void onDivide() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_DIVIDE;
+ IC.pushOperator(IC_DIVIDE);
+ break;
+ }
+ }
+ void onLBrac() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_RBRAC:
+ State = IES_PLUS;
+ IC.pushOperator(IC_PLUS);
+ break;
+ }
+ }
+ void onRBrac() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RBRAC;
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // a scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ }
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLParen() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_MINUS:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_LPAREN:
+ // FIXME: We don't handle this type of unary minus, yet.
+ if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ CurrState == IES_MINUS) {
+ State = IES_ERROR;
+ break;
+ }
+ State = IES_LPAREN;
+ IC.pushOperator(IC_LPAREN);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRParen() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_REGISTER:
+ case IES_RPAREN:
+ State = IES_RPAREN;
+ IC.pushOperator(IC_RPAREN);
+ break;
+ }
+ }
+ };
+
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
bool Error(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+ ArrayRef<SMRange> Ranges = None,
bool MatchingInlineAsm = false) {
if (MatchingInlineAsm) return true;
return Parser.Error(L, Msg, Ranges);
@@ -56,14 +494,25 @@ private:
X86Operand *ParseOperand();
X86Operand *ParseATTOperand();
X86Operand *ParseIntelOperand();
- X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc);
- X86Operand *ParseIntelOperator(SMLoc StartLoc, unsigned OpKind);
- X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc);
- X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size);
+ X86Operand *ParseIntelOffsetOfOperator();
+ X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+ X86Operand *ParseIntelOperator(unsigned OpKind);
+ X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp,
+ SMLoc StartLoc);
+ X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, unsigned Size);
+ X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
+
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
- bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp,
- SmallString<64> &Err);
+ X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
@@ -93,6 +542,10 @@ private:
setAvailableFeatures(FB);
}
+ bool isParsingIntelSyntax() {
+ return getParser().getAssemblerDialect();
+ }
+
/// @name Auto-generated Matcher Functions
/// {
@@ -115,10 +568,6 @@ public:
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
virtual bool ParseDirective(AsmToken DirectiveID);
-
- bool isParsingIntelSyntax() {
- return getParser().getAssemblerDialect();
- }
};
} // end anonymous namespace
@@ -168,6 +617,8 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc StartLoc, EndLoc;
SMLoc OffsetOfLoc;
+ StringRef SymName;
+ void *OpDecl;
bool AddressOf;
struct TokOp {
@@ -181,7 +632,6 @@ struct X86Operand : public MCParsedAsmOperand {
struct ImmOp {
const MCExpr *Val;
- bool NeedAsmRewrite;
};
struct MemOp {
@@ -191,7 +641,6 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned IndexReg;
unsigned Scale;
unsigned Size;
- bool NeedSizeDir;
};
union {
@@ -204,6 +653,9 @@ struct X86Operand : public MCParsedAsmOperand {
X86Operand(KindTy K, SMLoc Start, SMLoc End)
: Kind(K), StartLoc(Start), EndLoc(End) {}
+ StringRef getSymName() { return SymName; }
+ void *getOpDecl() { return OpDecl; }
+
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
@@ -236,11 +688,6 @@ struct X86Operand : public MCParsedAsmOperand {
return Imm.Val;
}
- bool needAsmRewrite() const {
- assert(Kind == Immediate && "Invalid access!");
- return Imm.NeedAsmRewrite;
- }
-
const MCExpr *getMemDisp() const {
assert(Kind == Memory && "Invalid access!");
return Mem.Disp;
@@ -337,11 +784,6 @@ struct X86Operand : public MCParsedAsmOperand {
return isImmSExti64i32Value(CE->getValue());
}
- unsigned getMemSize() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.Size;
- }
-
bool isOffsetOf() const {
return OffsetOfLoc.getPointer();
}
@@ -350,11 +792,6 @@ struct X86Operand : public MCParsedAsmOperand {
return AddressOf;
}
- bool needSizeDirective() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.NeedSizeDir;
- }
-
bool isMem() const { return Kind == Memory; }
bool isMem8() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 8);
@@ -394,6 +831,18 @@ struct X86Operand : public MCParsedAsmOperand {
return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
}
+ bool isMemVZ32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+ bool isMemVZ64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
@@ -454,6 +903,16 @@ struct X86Operand : public MCParsedAsmOperand {
addMemOperands(Inst, N);
}
+ void addMemVZ32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVZ64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem512Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
@@ -482,25 +941,28 @@ struct X86Operand : public MCParsedAsmOperand {
static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
bool AddressOf = false,
- SMLoc OffsetOfLoc = SMLoc()) {
+ SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
return Res;
}
- static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc,
- bool NeedRewrite = true){
+ static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
- Res->Imm.NeedAsmRewrite = NeedRewrite;
return Res;
}
/// Create an absolute memory operand.
static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, bool NeedSizeDir = false) {
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -508,8 +970,9 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.IndexReg = 0;
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
- Res->Mem.NeedSizeDir = NeedSizeDir;
- Res->AddressOf = false;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
return Res;
}
@@ -517,7 +980,9 @@ struct X86Operand : public MCParsedAsmOperand {
static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, bool NeedSizeDir = false) {
+ unsigned Size = 0,
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -532,8 +997,9 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.IndexReg = IndexReg;
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
- Res->Mem.NeedSizeDir = NeedSizeDir;
- Res->AddressOf = false;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
return Res;
}
};
@@ -689,251 +1155,105 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
return Size;
}
-enum IntelBracExprState {
- IBES_START,
- IBES_LBRAC,
- IBES_RBRAC,
- IBES_REGISTER,
- IBES_REGISTER_STAR,
- IBES_REGISTER_STAR_INTEGER,
- IBES_INTEGER,
- IBES_INTEGER_STAR,
- IBES_INDEX_REGISTER,
- IBES_IDENTIFIER,
- IBES_DISP_EXPR,
- IBES_MINUS,
- IBES_ERROR
-};
-
-class IntelBracExprStateMachine {
- IntelBracExprState State;
- unsigned BaseReg, IndexReg, Scale;
- int64_t Disp;
-
- unsigned TmpReg;
- int64_t TmpInteger;
-
- bool isPlus;
-
-public:
- IntelBracExprStateMachine(MCAsmParser &parser) :
- State(IBES_START), BaseReg(0), IndexReg(0), Scale(1), Disp(0),
- TmpReg(0), TmpInteger(0), isPlus(true) {}
-
- unsigned getBaseReg() { return BaseReg; }
- unsigned getIndexReg() { return IndexReg; }
- unsigned getScale() { return Scale; }
- int64_t getDisp() { return Disp; }
- bool isValidEndState() { return State == IBES_RBRAC; }
-
- void onPlus() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_INTEGER:
- State = IBES_START;
- if (isPlus)
- Disp += TmpInteger;
- else
- Disp -= TmpInteger;
- break;
- case IBES_REGISTER:
- State = IBES_START;
- // If we already have a BaseReg, then assume this is the IndexReg with a
- // scale of 1.
- if (!BaseReg) {
- BaseReg = TmpReg;
- } else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
- IndexReg = TmpReg;
- Scale = 1;
- }
- break;
- case IBES_INDEX_REGISTER:
- State = IBES_START;
- break;
- }
- isPlus = true;
- }
- void onMinus() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_START:
- State = IBES_MINUS;
- break;
- case IBES_INTEGER:
- State = IBES_START;
- if (isPlus)
- Disp += TmpInteger;
- else
- Disp -= TmpInteger;
- break;
- case IBES_REGISTER:
- State = IBES_START;
- // If we already have a BaseReg, then assume this is the IndexReg with a
- // scale of 1.
- if (!BaseReg) {
- BaseReg = TmpReg;
- } else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
- IndexReg = TmpReg;
- Scale = 1;
- }
- break;
- case IBES_INDEX_REGISTER:
- State = IBES_START;
- break;
- }
- isPlus = false;
- }
- void onRegister(unsigned Reg) {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_START:
- State = IBES_REGISTER;
- TmpReg = Reg;
- break;
- case IBES_INTEGER_STAR:
- assert (!IndexReg && "IndexReg already set!");
- State = IBES_INDEX_REGISTER;
- IndexReg = Reg;
- Scale = TmpInteger;
- break;
- }
- }
- void onDispExpr() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_START:
- State = IBES_DISP_EXPR;
- break;
- }
- }
- void onInteger(int64_t TmpInt) {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_START:
- State = IBES_INTEGER;
- TmpInteger = TmpInt;
- break;
- case IBES_MINUS:
- State = IBES_INTEGER;
- TmpInteger = TmpInt;
- break;
- case IBES_REGISTER_STAR:
- assert (!IndexReg && "IndexReg already set!");
- State = IBES_INDEX_REGISTER;
- IndexReg = TmpReg;
- Scale = TmpInt;
- break;
- }
- }
- void onStar() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_INTEGER:
- State = IBES_INTEGER_STAR;
- break;
- case IBES_REGISTER:
- State = IBES_REGISTER_STAR;
- break;
+X86Operand *
+X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info){
+ if (isa<MCSymbolRefExpr>(Disp)) {
+ // If this is not a VarDecl then assume it is a FuncDecl or some other label
+ // reference. We need an 'r' constraint here, so we need to create register
+ // operand to ensure proper matching. Just pick a GPR based on the size of
+ // a pointer.
+ if (!Info.IsVarDecl) {
+ unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
+ SMLoc(), Identifier, Info.OpDecl);
}
- }
- void onLBrac() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_RBRAC:
- State = IBES_START;
- isPlus = true;
- break;
+ if (!Size) {
+ Size = Info.Type * 8; // Size is in terms of bits in this context.
+ if (Size)
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
+ /*Len=*/0, Size));
}
}
- void onRBrac() {
- switch (State) {
- default:
- State = IBES_ERROR;
- break;
- case IBES_DISP_EXPR:
- State = IBES_RBRAC;
- break;
- case IBES_INTEGER:
- State = IBES_RBRAC;
- if (isPlus)
- Disp += TmpInteger;
- else
- Disp -= TmpInteger;
- break;
- case IBES_REGISTER:
- State = IBES_RBRAC;
- // If we already have a BaseReg, then assume this is the IndexReg with a
- // scale of 1.
- if (!BaseReg) {
- BaseReg = TmpReg;
- } else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
- IndexReg = TmpReg;
- Scale = 1;
+
+ // When parsing inline assembly we set the base register to a non-zero value
+ // if we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ BaseReg = BaseReg ? BaseReg : 1;
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, Identifier, Info.OpDecl);
+}
+
+static void
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
+ StringRef SymName, int64_t ImmDisp,
+ int64_t FinalImmDisp, SMLoc &BracLoc,
+ SMLoc &StartInBrac, SMLoc &End) {
+ // Remove the '[' and ']' from the IR string.
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1));
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1));
+
+ // If ImmDisp is non-zero, then we parsed a displacement before the
+ // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
+ // If ImmDisp doesn't match the displacement computed by the state machine
+ // then we have an additional displacement in the bracketed expression.
+ if (ImmDisp != FinalImmDisp) {
+ if (ImmDisp) {
+ // We have an immediate displacement before the bracketed expression.
+ // Adjust this to match the final immediate displacement.
+ bool Found = false;
+ for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+ E = AsmRewrites->end(); I != E; ++I) {
+ if ((*I).Loc.getPointer() > BracLoc.getPointer())
+ continue;
+ if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) {
+ assert (!Found && "ImmDisp already rewritten.");
+ (*I).Kind = AOK_Imm;
+ (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer();
+ (*I).Val = FinalImmDisp;
+ Found = true;
+ break;
+ }
}
- break;
- case IBES_INDEX_REGISTER:
- State = IBES_RBRAC;
- break;
+ assert (Found && "Unable to rewrite ImmDisp.");
+ (void)Found;
+ } else {
+ // We have a symbolic and an immediate displacement, but no displacement
+ // before the bracketed expression. Put the immediate displacement
+ // before the bracketed expression.
+ AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp));
}
}
-};
+ // Remove all the ImmPrefix rewrites within the brackets.
+ for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
+ E = AsmRewrites->end(); I != E; ++I) {
+ if ((*I).Loc.getPointer() < StartInBrac.getPointer())
+ continue;
+ if ((*I).Kind == AOK_ImmPrefix)
+ (*I).Kind = AOK_Delete;
+ }
+ const char *SymLocPtr = SymName.data();
+ // Skip everything before the symbol.
+ if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+ assert(Len > 0 && "Expected a non-negative length.");
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len));
+ }
+ // Skip everything after the symbol.
+ if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+ SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+ assert(Len > 0 && "Expected a non-negative length.");
+ AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len));
+ }
+}
-X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
- unsigned Size) {
+X86Operand *
+X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
const AsmToken &Tok = Parser.getTok();
- SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc();
-
- // Eat '['
- if (getLexer().isNot(AsmToken::LBrac))
- return ErrorOperand(Start, "Expected '[' token!");
- Parser.Lex();
- unsigned TmpReg = 0;
-
- // Try to handle '[' 'symbol' ']'
- if (getLexer().is(AsmToken::Identifier)) {
- if (ParseRegister(TmpReg, Start, End)) {
- const MCExpr *Disp;
- if (getParser().parseExpression(Disp, End))
- return 0;
-
- if (getLexer().isNot(AsmToken::RBrac))
- return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!");
- // Adjust the EndLoc due to the ']'.
- End = SMLoc::getFromPointer(Parser.getTok().getEndLoc().getPointer()-1);
- Parser.Lex();
- return X86Operand::CreateMem(Disp, Start, End, Size);
- }
- }
-
- // Parse [ BaseReg + Scale*IndexReg + Disp ].
bool Done = false;
- IntelBracExprStateMachine SM(Parser);
-
- // If we parsed a register, then the end loc has already been set and
- // the identifier has already been lexed. We also need to update the
- // state.
- if (TmpReg)
- SM.onRegister(TmpReg);
-
- const MCExpr *Disp = 0;
while (!Done) {
bool UpdateLocLex = true;
@@ -941,6 +1261,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
// identifier. Don't try an parse it as a register.
if (Tok.getString().startswith("."))
break;
+
+ // If we're parsing an immediate expression, we don't expect a '['.
+ if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+ break;
switch (getLexer().getKind()) {
default: {
@@ -950,82 +1274,185 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
}
return ErrorOperand(Tok.getLoc(), "Unexpected token!");
}
+ case AsmToken::EndOfStatement: {
+ Done = true;
+ break;
+ }
case AsmToken::Identifier: {
- // This could be a register or a displacement expression.
- if(!ParseRegister(TmpReg, Start, End)) {
+ // This could be a register or a symbolic displacement.
+ unsigned TmpReg;
+ const MCExpr *Val;
+ SMLoc IdentLoc = Tok.getLoc();
+ StringRef Identifier = Tok.getString();
+ if(!ParseRegister(TmpReg, IdentLoc, End)) {
SM.onRegister(TmpReg);
UpdateLocLex = false;
break;
- } else if (!getParser().parseExpression(Disp, End)) {
- SM.onDispExpr();
+ } else {
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ } else {
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
+ }
+ SM.onIdentifierExpr(Val, Identifier);
UpdateLocLex = false;
break;
}
return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
}
- case AsmToken::Integer: {
- int64_t Val = Tok.getIntVal();
- SM.onInteger(Val);
+ case AsmToken::Integer:
+ if (isParsingInlineAsm() && SM.getAddImmPrefix())
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+ Tok.getLoc()));
+ SM.onInteger(Tok.getIntVal());
break;
- }
case AsmToken::Plus: SM.onPlus(); break;
case AsmToken::Minus: SM.onMinus(); break;
case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::Slash: SM.onDivide(); break;
case AsmToken::LBrac: SM.onLBrac(); break;
case AsmToken::RBrac: SM.onRBrac(); break;
+ case AsmToken::LParen: SM.onLParen(); break;
+ case AsmToken::RParen: SM.onRParen(); break;
}
+ if (SM.hadError())
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+
if (!Done && UpdateLocLex) {
End = Tok.getLoc();
Parser.Lex(); // Consume the token.
}
}
+ return 0;
+}
- if (!Disp)
- Disp = MCConstantExpr::Create(SM.getDisp(), getContext());
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp,
+ unsigned Size) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(BracLoc, "Expected '[' token!");
+ Parser.Lex(); // Eat '['
+
+ SMLoc StartInBrac = Tok.getLoc();
+ // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
+ // may have already parsed an immediate displacement before the bracketed
+ // expression.
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+ if (X86Operand *Err = ParseIntelExpression(SM, End))
+ return Err;
+
+ const MCExpr *Disp;
+ if (const MCExpr *Sym = SM.getSym()) {
+ // A symbolic displacement.
+ Disp = Sym;
+ if (isParsingInlineAsm())
+ RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(),
+ ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+ End);
+ } else {
+ // An immediate displacement only.
+ Disp = MCConstantExpr::Create(SM.getImm(), getContext());
+ }
// Parse the dot operator (e.g., [ebx].foo.bar).
if (Tok.getString().startswith(".")) {
- SmallString<64> Err;
const MCExpr *NewDisp;
- if (ParseIntelDotOperator(Disp, &NewDisp, Err))
- return ErrorOperand(Tok.getLoc(), Err);
+ if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp))
+ return Err;
- End = Parser.getTok().getEndLoc();
+ End = Tok.getEndLoc();
Parser.Lex(); // Eat the field.
Disp = NewDisp;
}
int BaseReg = SM.getBaseReg();
int IndexReg = SM.getIndexReg();
+ int Scale = SM.getScale();
+ if (!isParsingInlineAsm()) {
+ // handle [-42]
+ if (!BaseReg && !IndexReg) {
+ if (!SegReg)
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+ else
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+ }
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size);
+ }
- // handle [-42]
- if (!BaseReg && !IndexReg) {
- if (!SegReg)
- return X86Operand::CreateMem(Disp, Start, End);
- else
- return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(), Info);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand,
+ SMLoc &End) {
+ assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ Val = 0;
+
+ StringRef LineBuf(Identifier.data());
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+ const AsmToken &Tok = Parser.getTok();
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ while (true) {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+
+ assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
+ if (End.getPointer() == EndPtr) break;
}
- int Scale = SM.getScale();
- return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
- Start, End, Size);
+ // Create the symbol reference.
+ Identifier = LineBuf;
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
+ return 0;
}
/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) {
+X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
+ int64_t ImmDisp,
+ SMLoc Start) {
const AsmToken &Tok = Parser.getTok();
SMLoc End;
unsigned Size = getIntelMemOperandSize(Tok.getString());
if (Size) {
- Parser.Lex();
- assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") &&
- "Unexpected token!");
- Parser.Lex();
+ Parser.Lex(); // Eat operand size (e.g., byte, word).
+ if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+ return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+ Parser.Lex(); // Eat ptr.
+ }
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::Integer)) {
+ if (isParsingInlineAsm())
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+ Tok.getLoc()));
+ int64_t ImmDisp = Tok.getIntVal();
+ Parser.Lex(); // Eat the integer.
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(Start, "Expected '[' token!");
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
}
if (getLexer().is(AsmToken::LBrac))
- return ParseIntelBracExpression(SegReg, Size);
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
if (!ParseRegister(SegReg, Start, End)) {
// Handel SegReg : [ ... ]
@@ -1034,63 +1461,37 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) {
Parser.Lex(); // Eat :
if (getLexer().isNot(AsmToken::LBrac))
return ErrorOperand(Start, "Expected '[' token!");
- return ParseIntelBracExpression(SegReg, Size);
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
}
- const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
- if (getParser().parseExpression(Disp, End))
- return 0;
+ const MCExpr *Val;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
- bool NeedSizeDir = false;
- bool IsVarDecl = false;
- if (isParsingInlineAsm()) {
- if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) {
- const MCSymbol &Sym = SymRef->getSymbol();
- // FIXME: The SemaLookup will fail if the name is anything other then an
- // identifier.
- // FIXME: Pass a valid SMLoc.
- unsigned tLength, tSize, tType;
- SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, tLength,
- tSize, tType, IsVarDecl);
- if (!Size)
- Size = tType * 8; // Size is in terms of bits in this context.
- NeedSizeDir = Size > 0;
- }
+ return X86Operand::CreateMem(Val, Start, End, Size);
}
- if (!isParsingInlineAsm())
- return X86Operand::CreateMem(Disp, Start, End, Size);
- else {
- // If this is not a VarDecl then assume it is a FuncDecl or some other label
- // reference. We need an 'r' constraint here, so we need to create register
- // operand to ensure proper matching. Just pick a GPR based on the size of
- // a pointer.
- if (!IsVarDecl) {
- unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
- return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true);
- }
- // When parsing inline assembly we set the base register to a non-zero value
- // as we don't know the actual value at this time. This is necessary to
- // get the matching correct in some cases.
- return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0,
- /*Scale*/1, Start, End, Size, NeedSizeDir);
- }
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
}
/// Parse the '.' operator.
-bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
- const MCExpr **NewDisp,
- SmallString<64> &Err) {
- AsmToken Tok = *&Parser.getTok();
- uint64_t OrigDispVal, DotDispVal;
+X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+ const MCExpr *&NewDisp) {
+ const AsmToken &Tok = Parser.getTok();
+ int64_t OrigDispVal, DotDispVal;
// FIXME: Handle non-constant expressions.
- if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) {
+ if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
OrigDispVal = OrigDisp->getValue();
- } else {
- Err = "Non-constant offsets are not supported!";
- return true;
- }
+ else
+ return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!");
// Drop the '.'.
StringRef DotDispStr = Tok.getString().drop_front(1);
@@ -1100,23 +1501,15 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
DotDispVal = DotDisp.getZExtValue();
- } else if (Tok.is(AsmToken::Identifier)) {
- // We should only see an identifier when parsing the original inline asm.
- // The front-end should rewrite this in terms of immediates.
- assert (isParsingInlineAsm() && "Unexpected field name!");
-
+ } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
unsigned DotDisp;
std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
- DotDisp)) {
- Err = "Unable to lookup field reference!";
- return true;
- }
+ DotDisp))
+ return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!");
DotDispVal = DotDisp;
- } else {
- Err = "Unexpected token type!";
- return true;
- }
+ } else
+ return ErrorOperand(Tok.getLoc(), "Unexpected token type!");
if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
@@ -1126,22 +1519,24 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
Val));
}
- *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
- return false;
+ NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+ return 0;
}
/// Parse the 'offset' operator. This operator is used to specify the
/// location rather then the content of a variable.
-X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
- SMLoc OffsetOfLoc = Start;
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc OffsetOfLoc = Tok.getLoc();
Parser.Lex(); // Eat offset.
- Start = Parser.getTok().getLoc();
- assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
- SMLoc End;
const MCExpr *Val;
- if (getParser().parseExpression(Val, End))
- return ErrorOperand(Start, "Unable to parse expression!");
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
+ return Err;
// Don't emit the offset operator.
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1151,7 +1546,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
// the size of a pointer.
unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
- OffsetOfLoc);
+ OffsetOfLoc, Identifier, Info.OpDecl);
}
enum IntelOperatorKind {
@@ -1166,34 +1561,25 @@ enum IntelOperatorKind {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) {
- SMLoc TypeLoc = Start;
- Parser.Lex(); // Eat offset.
- Start = Parser.getTok().getLoc();
- assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
-
- SMLoc End;
- const MCExpr *Val;
- if (getParser().parseExpression(Val, End))
- return 0;
-
- unsigned Length = 0, Size = 0, Type = 0;
- if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) {
- const MCSymbol &Sym = SymRef->getSymbol();
- // FIXME: The SemaLookup will fail if the name is anything other then an
- // identifier.
- // FIXME: Pass a valid SMLoc.
- bool IsVarDecl;
- if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Length,
- Size, Type, IsVarDecl))
- return ErrorOperand(Start, "Unable to lookup expr!");
- }
- unsigned CVal;
+X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc TypeLoc = Tok.getLoc();
+ Parser.Lex(); // Eat operator.
+
+ const MCExpr *Val = 0;
+ InlineAsmIdentifierInfo Info;
+ SMLoc Start = Tok.getLoc(), End;
+ StringRef Identifier = Tok.getString();
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ true, End))
+ return Err;
+
+ unsigned CVal = 0;
switch(OpKind) {
default: llvm_unreachable("Unexpected operand kind!");
- case IOK_LENGTH: CVal = Length; break;
- case IOK_SIZE: CVal = Size; break;
- case IOK_TYPE: CVal = Type; break;
+ case IOK_LENGTH: CVal = Info.Length; break;
+ case IOK_SIZE: CVal = Info.Size; break;
+ case IOK_TYPE: CVal = Info.Type; break;
}
// Rewrite the type operator and the C or C++ type or variable in terms of an
@@ -1202,32 +1588,58 @@ X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) {
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
- return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false);
+ return X86Operand::CreateImm(Imm, Start, End);
}
X86Operand *X86AsmParser::ParseIntelOperand() {
- SMLoc Start = Parser.getTok().getLoc(), End;
- StringRef AsmTokStr = Parser.getTok().getString();
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start = Tok.getLoc(), End;
// Offset, length, type and size operators.
if (isParsingInlineAsm()) {
+ StringRef AsmTokStr = Tok.getString();
if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
- return ParseIntelOffsetOfOperator(Start);
+ return ParseIntelOffsetOfOperator();
if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
- return ParseIntelOperator(Start, IOK_LENGTH);
+ return ParseIntelOperator(IOK_LENGTH);
if (AsmTokStr == "size" || AsmTokStr == "SIZE")
- return ParseIntelOperator(Start, IOK_SIZE);
+ return ParseIntelOperator(IOK_SIZE);
if (AsmTokStr == "type" || AsmTokStr == "TYPE")
- return ParseIntelOperator(Start, IOK_TYPE);
+ return ParseIntelOperator(IOK_TYPE);
}
// Immediate.
- if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
- getLexer().is(AsmToken::Minus)) {
- const MCExpr *Val;
- if (!getParser().parseExpression(Val, End)) {
- return X86Operand::CreateImm(Val, Start, End);
+ if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
+ getLexer().is(AsmToken::LParen)) {
+ AsmToken StartTok = Tok;
+ IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (X86Operand *Err = ParseIntelExpression(SM, End))
+ return Err;
+
+ int64_t Imm = SM.getImm();
+ if (isParsingInlineAsm()) {
+ unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+ if (StartTok.getString().size() == Len)
+ // Just add a prefix if this wasn't a complex immediate expression.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start));
+ else
+ // Otherwise, rewrite the complex expression as a single immediate.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm));
+ }
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
+ return X86Operand::CreateImm(ImmExpr, Start, End);
}
+
+ // Only positive immediates are valid.
+ if (Imm < 0)
+ return ErrorOperand(Start, "expected a positive immediate displacement "
+ "before bracketed expr.");
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start);
}
// Register.
@@ -1239,11 +1651,11 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
return X86Operand::CreateReg(RegNo, Start, End);
getParser().Lex(); // Eat the colon.
- return ParseIntelMemOperand(RegNo, Start);
+ return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
}
// Memory operand.
- return ParseIntelMemOperand(0, Start);
+ return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
}
X86Operand *X86AsmParser::ParseATTOperand() {
@@ -1267,7 +1679,6 @@ X86Operand *X86AsmParser::ParseATTOperand() {
if (getLexer().isNot(AsmToken::Colon))
return X86Operand::CreateReg(RegNo, Start, End);
-
getParser().Lex(); // Eat the colon.
return ParseMemOperand(RegNo, Start);
}
@@ -1819,7 +2230,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
assert(!Operands.empty() && "Unexpect empty operand list!");
X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
assert(Op->isToken() && "Leading operand should always be a mnemonic!");
- ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
+ ArrayRef<SMRange> EmptyRanges = None;
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
@@ -1921,25 +2332,25 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned Match1, Match2, Match3, Match4;
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match1 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[1];
Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match2 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[2];
Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match3 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[3];
Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match4 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index d14899d..7e20151 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -33,6 +33,7 @@ set(sources
X86TargetObjectFile.cpp
X86TargetTransformInfo.cpp
X86VZeroUpper.cpp
+ X86FixupLEAs.cpp
)
if( CMAKE_CL_64 )
@@ -52,7 +53,7 @@ endif()
add_llvm_target(X86CodeGen ${sources})
-add_dependencies(LLVMX86CodeGen intrinsics_gen)
+add_dependencies(LLVMX86CodeGen X86CommonTableGen intrinsics_gen)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ca6f80c..82af6fa 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -190,94 +190,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
uint64_t Address, uint64_t Offset,
uint64_t Width, MCInst &MI,
const MCDisassembler *Dis) {
- LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
- struct LLVMOpInfo1 SymbolicOp;
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- SymbolicOp.Value = Value;
- void *DisInfo = Dis->getDisInfoBlock();
-
- if (!getOpInfo ||
- !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
- // Clear SymbolicOp.Value from above and also all other fields.
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (!SymbolLookUp)
- return false;
- uint64_t ReferenceType;
- if (isBranch)
- ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
- else
- ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
- const char *ReferenceName;
- const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
- &ReferenceName);
- if (Name) {
- SymbolicOp.AddSymbol.Name = Name;
- SymbolicOp.AddSymbol.Present = true;
- }
- // For branches always create an MCExpr so it gets printed as hex address.
- else if (isBranch) {
- SymbolicOp.Value = Value;
- }
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
- (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
- if (!Name && !isBranch)
- return false;
- }
-
- MCContext *Ctx = Dis->getMCContext();
- const MCExpr *Add = NULL;
- if (SymbolicOp.AddSymbol.Present) {
- if (SymbolicOp.AddSymbol.Name) {
- StringRef Name(SymbolicOp.AddSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Add = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Sub = NULL;
- if (SymbolicOp.SubtractSymbol.Present) {
- if (SymbolicOp.SubtractSymbol.Name) {
- StringRef Name(SymbolicOp.SubtractSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Off = NULL;
- if (SymbolicOp.Value != 0)
- Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
- const MCExpr *Expr;
- if (Sub) {
- const MCExpr *LHS;
- if (Add)
- LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
- else
- LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
- else
- Expr = LHS;
- } else if (Add) {
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
- else
- Expr = Add;
- } else {
- if (Off != 0)
- Expr = Off;
- else
- Expr = MCConstantExpr::Create(0, *Ctx);
- }
-
- MI.addOperand(MCOperand::CreateExpr(Expr));
-
- return true;
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
}
/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
@@ -290,15 +204,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
const void *Decoder) {
const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (SymbolLookUp) {
- void *DisInfo = Dis->getDisInfoBlock();
- uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
- const char *ReferenceName;
- (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
- }
+ Dis->tryAddingPcLoadReferenceComment(Value, Address);
}
/// translateImmediate - Appends an immediate operand to an MCInst.
@@ -380,6 +286,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_XMM256:
mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
return;
+ case TYPE_XMM512:
+ mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4)));
+ return;
case TYPE_REL8:
isBranch = true;
pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
@@ -537,6 +446,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
EA_BASES_64BIT
REGS_XMM
REGS_YMM
+ REGS_ZMM
#undef ENTRY
}
} else {
@@ -659,6 +569,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM64:
case TYPE_XMM128:
case TYPE_XMM256:
+ case TYPE_XMM512:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
return translateRMRegister(mcInst, insn);
@@ -777,6 +688,15 @@ static bool translateInstruction(MCInst &mcInst,
}
mcInst.setOpcode(insn.instructionID);
+ // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
+ // prefix bytes should be disassembled as xrelease and xacquire then set the
+ // opcode to those instead of the rep and repne opcodes.
+ if (insn.xAcquireRelease) {
+ if(mcInst.getOpcode() == X86::REP_PREFIX)
+ mcInst.setOpcode(X86::XRELEASE_PREFIX);
+ else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+ mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+ }
int index;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 85d8a99..bb195ee 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -61,7 +61,7 @@ static int modRMRequired(OpcodeType type,
InstructionContext insnContext,
uint8_t opcode) {
const struct ContextDecision* decision = 0;
-
+
switch (type) {
case ONEBYTE:
decision = &ONEBYTE_SYM;
@@ -102,7 +102,7 @@ static InstrUID decode(OpcodeType type,
uint8_t opcode,
uint8_t modRM) {
const struct ModRMDecision* dec = 0;
-
+
switch (type) {
case ONEBYTE:
dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
@@ -123,7 +123,7 @@ static InstrUID decode(OpcodeType type,
dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
}
-
+
switch (dec->modrm_type) {
default:
debug("Corrupt table! Unknown modrm_type");
@@ -171,10 +171,10 @@ static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
*/
static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
-
+
if (!ret)
++(insn->readerCursor);
-
+
return ret;
}
@@ -238,19 +238,19 @@ CONSUME_FUNC(consumeUInt64, uint64_t)
*/
static void dbgprintf(struct InternalInstruction* insn,
const char* format,
- ...) {
+ ...) {
char buffer[256];
va_list ap;
-
+
if (!insn->dlog)
return;
-
+
va_start(ap, format);
(void)vsnprintf(buffer, sizeof(buffer), format, ap);
va_end(ap);
-
+
insn->dlog(insn->dlogArg, buffer);
-
+
return;
}
@@ -305,27 +305,61 @@ static int readPrefixes(struct InternalInstruction* insn) {
BOOL prefixGroups[4] = { FALSE };
uint64_t prefixLocation;
uint8_t byte = 0;
-
+
BOOL hasAdSize = FALSE;
BOOL hasOpSize = FALSE;
-
+
dbgprintf(insn, "readPrefixes()");
-
+
while (isPrefix) {
prefixLocation = insn->readerCursor;
-
+
if (consumeByte(insn, &byte))
return -1;
/*
- * If the first byte is a LOCK prefix break and let it be disassembled
- * as a lock "instruction", by creating an <MCInst #xxxx LOCK_PREFIX>.
- * FIXME there is currently no way to get the disassembler to print the
- * lock prefix if it is not the first byte.
+ * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+ * break and let it be disassembled as a normal "instruction".
*/
- if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
- break;
-
+ if (insn->readerCursor - 1 == insn->startLocation
+ && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
+ uint8_t nextByte;
+ if (byte == 0xf0)
+ break;
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ /*
+ * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+ * met:
+ * - it is followed by a LOCK (0xf0) prefix
+ * - it is followed by an xchg instruction
+ * then it should be disassembled as a xacquire/xrelease not repne/rep.
+ */
+ if ((byte == 0xf2 || byte == 0xf3) &&
+ ((nextByte == 0xf0) |
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+ insn->xAcquireRelease = TRUE;
+ /*
+ * Also if the byte is 0xf3, and the following condition is met:
+ * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+ * "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+ * then it should be disassembled as an xrelease not rep.
+ */
+ if (byte == 0xf3 &&
+ (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7))
+ insn->xAcquireRelease = TRUE;
+ if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+ if (consumeByte(insn, &nextByte))
+ return -1;
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ unconsumeByte(insn);
+ }
+ if (nextByte != 0x0f && nextByte != 0x90)
+ break;
+ }
+
switch (byte) {
case 0xf0: /* LOCK */
case 0xf2: /* REPNE/REPNZ */
@@ -387,21 +421,21 @@ static int readPrefixes(struct InternalInstruction* insn) {
isPrefix = FALSE;
break;
}
-
+
if (isPrefix)
dbgprintf(insn, "Found prefix 0x%hhx", byte);
}
-
+
insn->vexSize = 0;
-
+
if (byte == 0xc4) {
uint8_t byte1;
-
+
if (lookAtByte(insn, &byte1)) {
dbgprintf(insn, "Couldn't read second byte of VEX");
return -1;
}
-
+
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
insn->vexSize = 3;
insn->necessaryPrefixLocation = insn->readerCursor - 1;
@@ -410,67 +444,67 @@ static int readPrefixes(struct InternalInstruction* insn) {
unconsumeByte(insn);
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
-
+
if (insn->vexSize == 3) {
insn->vexPrefix[0] = byte;
consumeByte(insn, &insn->vexPrefix[1]);
consumeByte(insn, &insn->vexPrefix[2]);
/* We simulate the REX prefix for simplicity's sake */
-
+
if (insn->mode == MODE_64BIT) {
- insn->rexPrefix = 0x40
+ insn->rexPrefix = 0x40
| (wFromVEX3of3(insn->vexPrefix[2]) << 3)
| (rFromVEX2of3(insn->vexPrefix[1]) << 2)
| (xFromVEX2of3(insn->vexPrefix[1]) << 1)
| (bFromVEX2of3(insn->vexPrefix[1]) << 0);
}
-
+
switch (ppFromVEX3of3(insn->vexPrefix[2]))
{
default:
break;
case VEX_PREFIX_66:
- hasOpSize = TRUE;
+ hasOpSize = TRUE;
break;
}
-
+
dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
}
}
else if (byte == 0xc5) {
uint8_t byte1;
-
+
if (lookAtByte(insn, &byte1)) {
dbgprintf(insn, "Couldn't read second byte of VEX");
return -1;
}
-
+
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
insn->vexSize = 2;
}
else {
unconsumeByte(insn);
}
-
+
if (insn->vexSize == 2) {
insn->vexPrefix[0] = byte;
consumeByte(insn, &insn->vexPrefix[1]);
-
+
if (insn->mode == MODE_64BIT) {
- insn->rexPrefix = 0x40
+ insn->rexPrefix = 0x40
| (rFromVEX2of2(insn->vexPrefix[1]) << 2);
}
-
+
switch (ppFromVEX2of2(insn->vexPrefix[1]))
{
default:
break;
case VEX_PREFIX_66:
- hasOpSize = TRUE;
+ hasOpSize = TRUE;
break;
}
-
+
dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
}
}
@@ -478,17 +512,17 @@ static int readPrefixes(struct InternalInstruction* insn) {
if (insn->mode == MODE_64BIT) {
if ((byte & 0xf0) == 0x40) {
uint8_t opcodeByte;
-
+
if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
dbgprintf(insn, "Redundant REX prefix");
return -1;
}
-
+
insn->rexPrefix = byte;
insn->necessaryPrefixLocation = insn->readerCursor - 2;
-
+
dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
- } else {
+ } else {
unconsumeByte(insn);
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
@@ -526,7 +560,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->immediateSize = (hasOpSize ? 2 : 4);
}
}
-
+
return 0;
}
@@ -537,22 +571,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
* @param insn - The instruction whose opcode is to be read.
* @return - 0 if the opcode could be read successfully; nonzero otherwise.
*/
-static int readOpcode(struct InternalInstruction* insn) {
+static int readOpcode(struct InternalInstruction* insn) {
/* Determine the length of the primary opcode */
-
+
uint8_t current;
-
+
dbgprintf(insn, "readOpcode()");
-
+
insn->opcodeType = ONEBYTE;
-
+
if (insn->vexSize == 3)
{
switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
{
default:
dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
- return -1;
+ return -1;
case 0:
break;
case VEX_LOB_0F:
@@ -564,7 +598,7 @@ static int readOpcode(struct InternalInstruction* insn) {
insn->threeByteEscape = 0x38;
insn->opcodeType = THREEBYTE_38;
return consumeByte(insn, &insn->opcode);
- case VEX_LOB_0F3A:
+ case VEX_LOB_0F3A:
insn->twoByteEscape = 0x0f;
insn->threeByteEscape = 0x3a;
insn->opcodeType = THREEBYTE_3A;
@@ -577,68 +611,68 @@ static int readOpcode(struct InternalInstruction* insn) {
insn->opcodeType = TWOBYTE;
return consumeByte(insn, &insn->opcode);
}
-
+
if (consumeByte(insn, &current))
return -1;
-
+
if (current == 0x0f) {
dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
-
+
insn->twoByteEscape = current;
-
+
if (consumeByte(insn, &current))
return -1;
-
+
if (current == 0x38) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
+
insn->threeByteEscape = current;
-
+
if (consumeByte(insn, &current))
return -1;
-
+
insn->opcodeType = THREEBYTE_38;
} else if (current == 0x3a) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
+
insn->threeByteEscape = current;
-
+
if (consumeByte(insn, &current))
return -1;
-
+
insn->opcodeType = THREEBYTE_3A;
} else if (current == 0xa6) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
+
insn->threeByteEscape = current;
-
+
if (consumeByte(insn, &current))
return -1;
-
+
insn->opcodeType = THREEBYTE_A6;
} else if (current == 0xa7) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
+
insn->threeByteEscape = current;
-
+
if (consumeByte(insn, &current))
return -1;
-
+
insn->opcodeType = THREEBYTE_A7;
} else {
dbgprintf(insn, "Didn't find a three-byte escape prefix");
-
+
insn->opcodeType = TWOBYTE;
}
}
-
+
/*
* At this point we have consumed the full opcode.
* Anything we consume from here on must be unconsumed.
*/
-
+
insn->opcode = current;
-
+
return 0;
}
@@ -660,19 +694,19 @@ static int getIDWithAttrMask(uint16_t* instructionID,
struct InternalInstruction* insn,
uint8_t attrMask) {
BOOL hasModRMExtension;
-
+
uint8_t instructionClass;
instructionClass = contextForAttrs(attrMask);
-
+
hasModRMExtension = modRMRequired(insn->opcodeType,
instructionClass,
insn->opcode);
-
+
if (hasModRMExtension) {
if (readModRM(insn))
return -1;
-
+
*instructionID = decode(insn->opcodeType,
instructionClass,
insn->opcode,
@@ -683,7 +717,7 @@ static int getIDWithAttrMask(uint16_t* instructionID,
insn->opcode,
0);
}
-
+
return 0;
}
@@ -696,7 +730,7 @@ static int getIDWithAttrMask(uint16_t* instructionID,
*/
static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
off_t i;
-
+
for (i = 0;; i++) {
if (orig[i] == '\0' && equiv[i] == '\0')
return TRUE;
@@ -715,8 +749,8 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
}
/*
- * getID - Determines the ID of an instruction, consuming the ModR/M byte as
- * appropriate for extended and escape opcodes. Determines the attributes and
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ * appropriate for extended and escape opcodes. Determines the attributes and
* context for the instruction before doing so.
*
* @param insn - The instruction whose ID is to be determined.
@@ -726,21 +760,21 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
static int getID(struct InternalInstruction* insn, const void *miiArg) {
uint8_t attrMask;
uint16_t instructionID;
-
+
dbgprintf(insn, "getID()");
-
+
attrMask = ATTR_NONE;
if (insn->mode == MODE_64BIT)
attrMask |= ATTR_64BIT;
-
+
if (insn->vexSize) {
attrMask |= ATTR_VEX;
if (insn->vexSize == 3) {
switch (ppFromVEX3of3(insn->vexPrefix[2])) {
case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
+ attrMask |= ATTR_OPSIZE;
break;
case VEX_PREFIX_F3:
attrMask |= ATTR_XS;
@@ -749,14 +783,14 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_XD;
break;
}
-
+
if (lFromVEX3of3(insn->vexPrefix[2]))
attrMask |= ATTR_VEXL;
}
else if (insn->vexSize == 2) {
switch (ppFromVEX2of2(insn->vexPrefix[1])) {
case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
+ attrMask |= ATTR_OPSIZE;
break;
case VEX_PREFIX_F3:
attrMask |= ATTR_XS;
@@ -765,7 +799,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
attrMask |= ATTR_XD;
break;
}
-
+
if (lFromVEX2of2(insn->vexPrefix[1]))
attrMask |= ATTR_VEXL;
}
@@ -836,26 +870,26 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
* conservative, but in the specific case where OpSize is present but not
* in the right place we check if there's a 16-bit operation.
*/
-
+
const struct InstructionSpecifier *spec;
uint16_t instructionIDWithOpsize;
const char *specName, *specWithOpSizeName;
-
+
spec = specifierForUID(instructionID);
-
+
if (getIDWithAttrMask(&instructionIDWithOpsize,
insn,
attrMask | ATTR_OPSIZE)) {
- /*
+ /*
* ModRM required with OpSize but not present; give up and return version
* without OpSize set
*/
-
+
insn->instructionID = instructionID;
insn->spec = spec;
return 0;
}
-
+
specName = x86DisassemblerGetInstrName(instructionID, miiArg);
specWithOpSizeName =
x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
@@ -882,10 +916,10 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
const struct InstructionSpecifier *specWithNewOpcode;
spec = specifierForUID(instructionID);
-
+
/* Borrow opcode from one of the other XCHGar opcodes */
insn->opcode = 0x91;
-
+
if (getIDWithAttrMask(&instructionIDWithNewOpcode,
insn,
attrMask)) {
@@ -906,10 +940,10 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
return 0;
}
-
+
insn->instructionID = instructionID;
insn->spec = specifierForUID(insn->instructionID);
-
+
return 0;
}
@@ -924,14 +958,14 @@ static int readSIB(struct InternalInstruction* insn) {
SIBIndex sibIndexBase = 0;
SIBBase sibBaseBase = 0;
uint8_t index, base;
-
+
dbgprintf(insn, "readSIB()");
-
+
if (insn->consumedSIB)
return 0;
-
+
insn->consumedSIB = TRUE;
-
+
switch (insn->addressSize) {
case 2:
dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
@@ -949,9 +983,9 @@ static int readSIB(struct InternalInstruction* insn) {
if (consumeByte(insn, &insn->sib))
return -1;
-
+
index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
-
+
switch (index) {
case 0x4:
insn->sibIndex = SIB_INDEX_NONE;
@@ -963,7 +997,7 @@ static int readSIB(struct InternalInstruction* insn) {
insn->sibIndex = SIB_INDEX_NONE;
break;
}
-
+
switch (scaleFromSIB(insn->sib)) {
case 0:
insn->sibScale = 1;
@@ -978,9 +1012,9 @@ static int readSIB(struct InternalInstruction* insn) {
insn->sibScale = 8;
break;
}
-
+
base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
-
+
switch (base) {
case 0x5:
switch (modFromModRM(insn->modRM)) {
@@ -990,12 +1024,12 @@ static int readSIB(struct InternalInstruction* insn) {
break;
case 0x1:
insn->eaDisplacement = EA_DISP_8;
- insn->sibBase = (insn->addressSize == 4 ?
+ insn->sibBase = (insn->addressSize == 4 ?
SIB_BASE_EBP : SIB_BASE_RBP);
break;
case 0x2:
insn->eaDisplacement = EA_DISP_32;
- insn->sibBase = (insn->addressSize == 4 ?
+ insn->sibBase = (insn->addressSize == 4 ?
SIB_BASE_EBP : SIB_BASE_RBP);
break;
case 0x3:
@@ -1007,7 +1041,7 @@ static int readSIB(struct InternalInstruction* insn) {
insn->sibBase = (SIBBase)(sibBaseBase + base);
break;
}
-
+
return 0;
}
@@ -1015,22 +1049,22 @@ static int readSIB(struct InternalInstruction* insn) {
* readDisplacement - Consumes the displacement of an instruction.
*
* @param insn - The instruction whose displacement is to be read.
- * @return - 0 if the displacement byte was successfully read; nonzero
+ * @return - 0 if the displacement byte was successfully read; nonzero
* otherwise.
*/
-static int readDisplacement(struct InternalInstruction* insn) {
+static int readDisplacement(struct InternalInstruction* insn) {
int8_t d8;
int16_t d16;
int32_t d32;
-
+
dbgprintf(insn, "readDisplacement()");
-
+
if (insn->consumedDisplacement)
return 0;
-
+
insn->consumedDisplacement = TRUE;
insn->displacementOffset = insn->readerCursor - insn->startLocation;
-
+
switch (insn->eaDisplacement) {
case EA_DISP_NONE:
insn->consumedDisplacement = FALSE;
@@ -1051,7 +1085,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
insn->displacement = d32;
break;
}
-
+
insn->consumedDisplacement = TRUE;
return 0;
}
@@ -1063,22 +1097,22 @@ static int readDisplacement(struct InternalInstruction* insn) {
* @param insn - The instruction whose addressing information is to be read.
* @return - 0 if the information was successfully read; nonzero otherwise.
*/
-static int readModRM(struct InternalInstruction* insn) {
+static int readModRM(struct InternalInstruction* insn) {
uint8_t mod, rm, reg;
-
+
dbgprintf(insn, "readModRM()");
-
+
if (insn->consumedModRM)
return 0;
-
+
if (consumeByte(insn, &insn->modRM))
return -1;
insn->consumedModRM = TRUE;
-
+
mod = modFromModRM(insn->modRM);
rm = rmFromModRM(insn->modRM);
reg = regFromModRM(insn->modRM);
-
+
/*
* This goes by insn->registerSize to pick the correct register, which messes
* up if we're using (say) XMM or 8-bit register operands. That gets fixed in
@@ -1098,16 +1132,16 @@ static int readModRM(struct InternalInstruction* insn) {
insn->eaRegBase = EA_REG_RAX;
break;
}
-
+
reg |= rFromREX(insn->rexPrefix) << 3;
rm |= bFromREX(insn->rexPrefix) << 3;
-
+
insn->reg = (Reg)(insn->regBase + reg);
-
+
switch (insn->addressSize) {
case 2:
insn->eaBaseBase = EA_BASE_BX_SI;
-
+
switch (mod) {
case 0x0:
if (rm == 0x6) {
@@ -1142,14 +1176,14 @@ static int readModRM(struct InternalInstruction* insn) {
case 4:
case 8:
insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
-
+
switch (mod) {
case 0x0:
insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
switch (rm) {
case 0x4:
case 0xc: /* in case REXW.b is set */
- insn->eaBase = (insn->addressSize == 4 ?
+ insn->eaBase = (insn->addressSize == 4 ?
EA_BASE_sib : EA_BASE_sib64);
readSIB(insn);
if (readDisplacement(insn))
@@ -1191,7 +1225,7 @@ static int readModRM(struct InternalInstruction* insn) {
}
break;
} /* switch (insn->addressSize) */
-
+
return 0;
}
@@ -1221,6 +1255,8 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_EAX + index; \
case TYPE_R64: \
return prefix##_RAX + index; \
+ case TYPE_XMM512: \
+ return prefix##_ZMM0 + index; \
case TYPE_XMM256: \
return prefix##_YMM0 + index; \
case TYPE_XMM128: \
@@ -1274,12 +1310,12 @@ GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
* @return - 0 if fixup was successful; -1 if the register returned was
* invalid for its class.
*/
-static int fixupReg(struct InternalInstruction *insn,
+static int fixupReg(struct InternalInstruction *insn,
const struct OperandSpecifier *op) {
uint8_t valid;
-
+
dbgprintf(insn, "fixupReg()");
-
+
switch ((OperandEncoding)op->encoding) {
default:
debug("Expected a REG or R/M encoding in fixupReg");
@@ -1311,12 +1347,12 @@ static int fixupReg(struct InternalInstruction *insn,
}
break;
}
-
+
return 0;
}
/*
- * readOpcodeModifier - Reads an operand from the opcode field of an
+ * readOpcodeModifier - Reads an operand from the opcode field of an
* instruction. Handles AddRegFrm instructions.
*
* @param insn - The instruction whose opcode field is to be read.
@@ -1326,12 +1362,12 @@ static int fixupReg(struct InternalInstruction *insn,
*/
static int readOpcodeModifier(struct InternalInstruction* insn) {
dbgprintf(insn, "readOpcodeModifier()");
-
+
if (insn->consumedOpcodeModifier)
return 0;
-
+
insn->consumedOpcodeModifier = TRUE;
-
+
switch (insn->spec->modifierType) {
default:
debug("Unknown modifier type.");
@@ -1345,11 +1381,11 @@ static int readOpcodeModifier(struct InternalInstruction* insn) {
case MODIFIER_MODRM:
insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
return 0;
- }
+ }
}
/*
- * readOpcodeRegister - Reads an operand from the opcode field of an
+ * readOpcodeRegister - Reads an operand from the opcode field of an
* instruction and interprets it appropriately given the operand width.
* Handles AddRegFrm instructions.
*
@@ -1364,39 +1400,39 @@ static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
if (readOpcodeModifier(insn))
return -1;
-
+
if (size == 0)
size = insn->registerSize;
-
+
switch (size) {
case 1:
- insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+ insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
| insn->opcodeModifier));
- if (insn->rexPrefix &&
+ if (insn->rexPrefix &&
insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
insn->opcodeRegister < MODRM_REG_AL + 0x8) {
insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+ (insn->opcodeRegister - MODRM_REG_AL - 4));
}
-
+
break;
case 2:
insn->opcodeRegister = (Reg)(MODRM_REG_AX
- + ((bFromREX(insn->rexPrefix) << 3)
+ + ((bFromREX(insn->rexPrefix) << 3)
| insn->opcodeModifier));
break;
case 4:
insn->opcodeRegister = (Reg)(MODRM_REG_EAX
- + ((bFromREX(insn->rexPrefix) << 3)
+ + ((bFromREX(insn->rexPrefix) << 3)
| insn->opcodeModifier));
break;
case 8:
- insn->opcodeRegister = (Reg)(MODRM_REG_RAX
- + ((bFromREX(insn->rexPrefix) << 3)
+ insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ + ((bFromREX(insn->rexPrefix) << 3)
| insn->opcodeModifier));
break;
}
-
+
return 0;
}
@@ -1414,20 +1450,20 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
uint16_t imm16;
uint32_t imm32;
uint64_t imm64;
-
+
dbgprintf(insn, "readImmediate()");
-
+
if (insn->numImmediatesConsumed == 2) {
debug("Already consumed two immediates");
return -1;
}
-
+
if (size == 0)
size = insn->immediateSize;
else
insn->immediateSize = size;
insn->immediateOffset = insn->readerCursor - insn->startLocation;
-
+
switch (size) {
case 1:
if (consumeByte(insn, &imm8))
@@ -1450,9 +1486,9 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
insn->immediates[insn->numImmediatesConsumed] = imm64;
break;
}
-
+
insn->numImmediatesConsumed++;
-
+
return 0;
}
@@ -1465,7 +1501,7 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
*/
static int readVVVV(struct InternalInstruction* insn) {
dbgprintf(insn, "readVVVV()");
-
+
if (insn->vexSize == 3)
insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
else if (insn->vexSize == 2)
@@ -1490,14 +1526,14 @@ static int readOperands(struct InternalInstruction* insn) {
int index;
int hasVVVV, needVVVV;
int sawRegImm = 0;
-
+
dbgprintf(insn, "readOperands()");
/* If non-zero vvvv specified, need to make sure one of the operands
uses it. */
hasVVVV = !readVVVV(insn);
needVVVV = hasVVVV && (insn->vvvv != 0);
-
+
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
switch (x86OperandSets[insn->spec->operands][index].encoding) {
case ENCODING_NONE:
@@ -1599,7 +1635,7 @@ static int readOperands(struct InternalInstruction* insn) {
/* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
if (needVVVV) return -1;
-
+
return 0;
}
@@ -1607,7 +1643,7 @@ static int readOperands(struct InternalInstruction* insn) {
* decodeInstruction - Reads and interprets a full instruction provided by the
* user.
*
- * @param insn - A pointer to the instruction to be populated. Must be
+ * @param insn - A pointer to the instruction to be populated. Must be
* pre-allocated.
* @param reader - The function to be used to read the instruction's bytes.
* @param readerArg - A generic argument to be passed to the reader to store
@@ -1632,7 +1668,7 @@ int decodeInstruction(struct InternalInstruction* insn,
uint64_t startLoc,
DisassemblerMode mode) {
memset(insn, 0, sizeof(struct InternalInstruction));
-
+
insn->reader = reader;
insn->readerArg = readerArg;
insn->dlog = logger;
@@ -1641,7 +1677,7 @@ int decodeInstruction(struct InternalInstruction* insn,
insn->readerCursor = startLoc;
insn->mode = mode;
insn->numImmediatesConsumed = 0;
-
+
if (readPrefixes(insn) ||
readOpcode(insn) ||
getID(insn, miiArg) ||
@@ -1650,14 +1686,14 @@ int decodeInstruction(struct InternalInstruction* insn,
return -1;
insn->operands = &x86OperandSets[insn->spec->operands][0];
-
+
insn->length = insn->readerCursor - insn->startLocation;
-
+
dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
startLoc, insn->readerCursor, insn->length);
-
+
if (insn->length > 15)
dbgprintf(insn, "Instruction exceeds 15-byte limit");
-
+
return 0;
}
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 407ead3..dcb6aad 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -219,7 +219,23 @@ extern "C" {
ENTRY(XMM12) \
ENTRY(XMM13) \
ENTRY(XMM14) \
- ENTRY(XMM15)
+ ENTRY(XMM15) \
+ ENTRY(XMM16) \
+ ENTRY(XMM17) \
+ ENTRY(XMM18) \
+ ENTRY(XMM19) \
+ ENTRY(XMM20) \
+ ENTRY(XMM21) \
+ ENTRY(XMM22) \
+ ENTRY(XMM23) \
+ ENTRY(XMM24) \
+ ENTRY(XMM25) \
+ ENTRY(XMM26) \
+ ENTRY(XMM27) \
+ ENTRY(XMM28) \
+ ENTRY(XMM29) \
+ ENTRY(XMM30) \
+ ENTRY(XMM31)
#define REGS_YMM \
ENTRY(YMM0) \
@@ -237,7 +253,57 @@ extern "C" {
ENTRY(YMM12) \
ENTRY(YMM13) \
ENTRY(YMM14) \
- ENTRY(YMM15)
+ ENTRY(YMM15) \
+ ENTRY(YMM16) \
+ ENTRY(YMM17) \
+ ENTRY(YMM18) \
+ ENTRY(YMM19) \
+ ENTRY(YMM20) \
+ ENTRY(YMM21) \
+ ENTRY(YMM22) \
+ ENTRY(YMM23) \
+ ENTRY(YMM24) \
+ ENTRY(YMM25) \
+ ENTRY(YMM26) \
+ ENTRY(YMM27) \
+ ENTRY(YMM28) \
+ ENTRY(YMM29) \
+ ENTRY(YMM30) \
+ ENTRY(YMM31)
+
+#define REGS_ZMM \
+ ENTRY(ZMM0) \
+ ENTRY(ZMM1) \
+ ENTRY(ZMM2) \
+ ENTRY(ZMM3) \
+ ENTRY(ZMM4) \
+ ENTRY(ZMM5) \
+ ENTRY(ZMM6) \
+ ENTRY(ZMM7) \
+ ENTRY(ZMM8) \
+ ENTRY(ZMM9) \
+ ENTRY(ZMM10) \
+ ENTRY(ZMM11) \
+ ENTRY(ZMM12) \
+ ENTRY(ZMM13) \
+ ENTRY(ZMM14) \
+ ENTRY(ZMM15) \
+ ENTRY(ZMM16) \
+ ENTRY(ZMM17) \
+ ENTRY(ZMM18) \
+ ENTRY(ZMM19) \
+ ENTRY(ZMM20) \
+ ENTRY(ZMM21) \
+ ENTRY(ZMM22) \
+ ENTRY(ZMM23) \
+ ENTRY(ZMM24) \
+ ENTRY(ZMM25) \
+ ENTRY(ZMM26) \
+ ENTRY(ZMM27) \
+ ENTRY(ZMM28) \
+ ENTRY(ZMM29) \
+ ENTRY(ZMM30) \
+ ENTRY(ZMM31)
#define REGS_SEGMENT \
ENTRY(ES) \
@@ -285,6 +351,7 @@ extern "C" {
REGS_MMX \
REGS_XMM \
REGS_YMM \
+ REGS_ZMM \
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
@@ -319,6 +386,7 @@ typedef enum {
ALL_EA_BASES
REGS_XMM
REGS_YMM
+ REGS_ZMM
#undef ENTRY
SIB_INDEX_max
} SIBIndex;
@@ -457,6 +525,8 @@ struct InternalInstruction {
uint64_t necessaryPrefixLocation;
/* The segment override type */
SegmentOverride segmentOverride;
+ /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
+ BOOL xAcquireRelease;
/* Sizes of various critical pieces of data, in bytes */
uint8_t registerSize;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 23dfe4b..d291441 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -116,8 +116,106 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
- ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize")
-
+ ENUM_ENTRY(IC_VEX_L_W, 3, "requires VEX, L and W") \
+ ENUM_ENTRY(IC_VEX_L_W_XS, 4, "requires VEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_XD, 4, "requires VEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 4, "requires VEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
+ ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize")
#define ENUM_ENTRY(n, r, d) n,
typedef enum {
@@ -224,6 +322,7 @@ struct ContextDecision {
ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
+ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
ENUM_ENTRY(ENCODING_CW, "2-byte") \
ENUM_ENTRY(ENCODING_CD, "4-byte") \
@@ -321,6 +420,9 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_XMM64, "8-byte") \
ENUM_ENTRY(TYPE_XMM128, "16-byte") \
ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+ ENUM_ENTRY(TYPE_XMM512, "64-byte") \
+ ENUM_ENTRY(TYPE_VK8, "8-bit") \
+ ENUM_ENTRY(TYPE_VK16, "16-bit") \
ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index e357710..b9d0082 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// Try to print any aliases first.
if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
-
+
// Next always print the annotation.
printAnnotation(OS, Annot);
@@ -139,8 +139,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
int64_t Address;
if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ O << formatHex((uint64_t)Address);
}
else {
// Otherwise, just print the expression.
@@ -159,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << markup("<imm:")
<< '$' << formatImm((int64_t)Op.getImm())
<< markup(">");
-
+
if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
*CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
-
+
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << markup("<imm:")
@@ -177,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
const MCOperand &IndexReg = MI->getOperand(Op+2);
const MCOperand &DispSpec = MI->getOperand(Op+3);
const MCOperand &SegReg = MI->getOperand(Op+4);
-
+
O << markup("<mem:");
// If this has a segment register, print it.
@@ -185,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
printOperand(MI, Op+4, O);
O << ':';
}
-
+
if (DispSpec.isImm()) {
int64_t DispVal = DispSpec.getImm();
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
@@ -194,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
O << *DispSpec.getExpr();
}
-
+
if (IndexReg.getReg() || BaseReg.getReg()) {
O << '(';
if (BaseReg.getReg())
printOperand(MI, Op, O);
-
+
if (IndexReg.getReg()) {
O << ',';
printOperand(MI, Op+2, O);
unsigned ScaleVal = MI->getOperand(Op+1).getImm();
if (ScaleVal != 1) {
O << ','
- << markup("<imm:")
+ << markup("<imm:")
<< ScaleVal // never printed in hex.
- << markup(">");
+ << markup(">");
}
}
O << ')';
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8e09183..8d05256 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -65,6 +65,9 @@ public:
void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
@@ -80,6 +83,9 @@ public:
void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
};
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 141f4a4..9dfc9a9 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -119,7 +119,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
- O << Op.getImm();
+ O << formatImm(Op.getImm());
else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
// If a symbolic branch target was added as a constant expression then print
@@ -127,8 +127,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
int64_t Address;
if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ O << formatHex((uint64_t)Address);
}
else {
// Otherwise, just print the expression.
@@ -137,18 +136,13 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
}
}
-static void PrintRegName(raw_ostream &O, StringRef RegName) {
- for (unsigned i = 0, e = RegName.size(); i != e; ++i)
- O << (char)toupper(RegName[i]);
-}
-
void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- PrintRegName(O, getRegisterName(Op.getReg()));
+ printRegName(O, Op.getReg());
} else if (Op.isImm()) {
- O << Op.getImm();
+ O << formatImm((int64_t)Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << *Op.getExpr();
@@ -200,7 +194,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
DispVal = -DispVal;
}
}
- O << DispVal;
+ O << formatImm(DispVal);
}
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index bb769eb..45beeda 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -41,52 +41,60 @@ public:
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "OPAQUE PTR ";
+ O << "opaque ptr ";
printMemReference(MI, OpNo, O);
}
void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "BYTE PTR ";
+ O << "byte ptr ";
printMemReference(MI, OpNo, O);
}
void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "WORD PTR ";
+ O << "word ptr ";
printMemReference(MI, OpNo, O);
}
void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "DWORD PTR ";
+ O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "QWORD PTR ";
+ O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XMMWORD PTR ";
+ O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "YMMWORD PTR ";
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "DWORD PTR ";
+ O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "QWORD PTR ";
+ O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XWORD PTR ";
+ O << "xword ptr ";
printMemReference(MI, OpNo, O);
}
void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XMMWORD PTR ";
+ O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "YMMWORD PTR ";
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
};
diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk
index 39478b3..31ba842 100644
--- a/lib/Target/X86/MCTargetDesc/Android.mk
+++ b/lib/Target/X86/MCTargetDesc/Android.mk
@@ -8,9 +8,11 @@ x86_mc_desc_TBLGEN_TABLES := \
x86_mc_desc_SRC_FILES := \
X86AsmBackend.cpp \
X86ELFObjectWriter.cpp \
+ X86ELFRelocationInfo.cpp \
X86MCTargetDesc.cpp \
X86MCAsmInfo.cpp \
X86MCCodeEmitter.cpp \
+ X86MachORelocationInfo.cpp \
X86MachObjectWriter.cpp \
X86WinCOFFObjectWriter.cpp
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 1c240e5..2eb5f25 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -6,6 +6,8 @@ add_llvm_library(LLVMX86Desc
X86MachObjectWriter.cpp
X86ELFObjectWriter.cpp
X86WinCOFFObjectWriter.cpp
+ X86MachORelocationInfo.cpp
+ X86ELFRelocationInfo.cpp
)
add_dependencies(LLVMX86Desc X86CommonTableGen)
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 9e68388..25d1af3 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -20,6 +20,7 @@
#include "X86MCTargetDesc.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCInstrInfo.h"
namespace llvm {
@@ -41,7 +42,6 @@ namespace X86 {
AddrNumOperands = 5
};
} // end namespace X86;
-
/// X86II - This namespace holds all of the target specific flags that
/// instruction info tracks.
@@ -274,11 +274,12 @@ namespace X86II {
//// MRM_XX - A mod/rm byte of exactly 0xXX.
MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
- MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
- MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
- MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50,
- MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54,
- MRM_DE = 55, MRM_DF = 56,
+ MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, MRM_CB = 40,
+ MRM_E8 = 41, MRM_F0 = 42, MRM_F8 = 45, MRM_F9 = 46,
+ MRM_D0 = 47, MRM_D1 = 48, MRM_D4 = 49, MRM_D5 = 50,
+ MRM_D6 = 51, MRM_D8 = 52, MRM_D9 = 53, MRM_DA = 54,
+ MRM_DB = 55, MRM_DC = 56, MRM_DD = 57, MRM_DE = 58,
+ MRM_DF = 59,
/// RawFrmImm8 - This is used for the ENTER instruction, which has two
/// immediates, the first of which is a 16-bit immediate (specified by
@@ -461,20 +462,54 @@ namespace X86II {
// prefix. Usually used for scalar instructions. Needed by disassembler.
VEX_LIG = 1U << 6,
+ // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
+ // with following encoding:
+ // - 00 V128
+ // - 01 V256
+ // - 10 V512
+ // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
+ // this will save 1 tsflag bit
+
+ // VEX_EVEX - Specifies that this instruction use EVEX form which provides
+ // syntax support up to 32 512-bit register operands and up to 7 16-bit
+ // mask operands as well as source operand data swizzling/memory operand
+ // conversion, eviction hint, and rounding mode.
+ EVEX = 1U << 7,
+
+ // EVEX_K - Set if this instruction requires masking
+ EVEX_K = 1U << 8,
+
+ // EVEX_Z - Set if this instruction has EVEX.Z field set.
+ EVEX_Z = 1U << 9,
+
+ // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+ EVEX_L2 = 1U << 10,
+
+ // EVEX_B - Set if this instruction has EVEX.B field set.
+ EVEX_B = 1U << 11,
+
+ // EVEX_CD8E - compressed disp8 form, element-size
+ EVEX_CD8EShift = VEXShift + 12,
+ EVEX_CD8EMask = 3,
+
+ // EVEX_CD8V - compressed disp8 form, vector-width
+ EVEX_CD8VShift = EVEX_CD8EShift + 2,
+ EVEX_CD8VMask = 7,
+
/// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
/// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
/// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
/// storing a classifier in the imm8 field. To simplify our implementation,
/// we handle this by storeing the classifier in the opcode field and using
/// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcode = 1U << 7,
+ Has3DNow0F0FOpcode = 1U << 17,
/// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
/// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
- MemOp4 = 1U << 8,
+ MemOp4 = 1U << 18,
/// XOP - Opcode prefix used by XOP instructions.
- XOP = 1U << 9
+ XOP = 1U << 19
};
@@ -521,6 +556,33 @@ namespace X86II {
}
}
+ /// getOperandBias - compute any additional adjustment needed to
+ /// the offset to the start of the memory operand
+ /// in this instruction.
+ /// If this is a two-address instruction,skip one of the register operands.
+ /// FIXME: This should be handled during MCInst lowering.
+ inline int getOperandBias(const MCInstrDesc& Desc)
+ {
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ ++CurOp;
+ return CurOp;
+ }
+
/// getMemoryOperandNo - The function returns the MCInst operand # for the
/// first field of the memory operand. If the instruction doesn't have a
/// memory operand, this returns -1.
@@ -548,12 +610,15 @@ namespace X86II {
case X86II::MRMSrcMem: {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
if (HasMemOp4)
++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
-
+ if (HasEVEX_K)
+ ++FirstMemOp;// Skip the mask register
// FIXME: Maybe lea should have its own form? This is a horrible hack.
//if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
// Opcode == X86::LEA16r || Opcode == X86::LEA32r)
@@ -574,17 +639,15 @@ namespace X86II {
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
return FirstMemOp;
}
- case X86II::MRM_C1: case X86II::MRM_C2:
- case X86II::MRM_C3: case X86II::MRM_C4:
- case X86II::MRM_C8: case X86II::MRM_C9:
- case X86II::MRM_E8: case X86II::MRM_F0:
- case X86II::MRM_F8: case X86II::MRM_F9:
- case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D5:
- case X86II::MRM_D8: case X86II::MRM_D9:
- case X86II::MRM_DA: case X86II::MRM_DB:
- case X86II::MRM_DC: case X86II::MRM_DD:
- case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+ case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_E8:
+ case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF:
return -1;
}
}
@@ -592,6 +655,14 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
+ (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
+ (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
+ (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+ return true;
+
switch (RegNo) {
default: break;
case X86::R8: case X86::R9: case X86::R10: case X86::R11:
@@ -602,16 +673,21 @@ namespace X86II {
case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
- case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
- case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
- case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
- case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
return true;
}
return false;
}
+
+ /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
+ /// registers? e.g. zmm21, etc.
+ static inline bool is32ExtendedReg(unsigned RegNo) {
+ return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+ }
+
inline bool isX86_64NonExtLowByteReg(unsigned reg) {
return (reg == X86::SPL || reg == X86::BPL ||
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index de80dd8..b400b87 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -101,7 +101,18 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
- case FK_Data_8: Type = ELF::R_X86_64_64; break;
+ case FK_Data_8:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_64;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_X86_64_DTPOFF64;
+ break;
+ }
+ break;
case X86::reloc_signed_4byte:
switch (Modifier) {
default:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
new file mode 100644
index 0000000..8f4ab46
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -0,0 +1,135 @@
+//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+using namespace object;
+using namespace ELF;
+
+namespace {
+class X86_64ELFRelocationInfo : public MCRelocationInfo {
+public:
+ X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ uint64_t RelType; Rel.getType(RelType);
+ symbol_iterator SymI = Rel.getSymbol();
+
+ StringRef SymName; SymI->getName(SymName);
+ uint64_t SymAddr; SymI->getAddress(SymAddr);
+ uint64_t SymSize; SymI->getSize(SymSize);
+ int64_t Addend; getELFRelocationAddend(Rel, Addend);
+
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (Sym->isVariable() == false)
+ Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+
+ const MCExpr *Expr = 0;
+ // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
+ bool hasAddend = false;
+
+ // The AMD64 SysV ABI says:
+ // A: the addend used to compute the value of the relocatable field.
+ // B: the base address at which a shared object has been loaded into memory
+ // during execution. Generally, a shared object is built with a 0 base
+ // virtual address, but the execution address will be different.
+ // G: the offset into the global offset table at which the relocation
+ // entry's symbol will reside during execution.
+ // GOT: the address of the global offset table.
+ // L: the place (section offset or address) of the Procedure Linkage Table
+ // entry for a symbol.
+ // P: the place (section offset or address) of the storage unit being
+ // relocated (computed using r_offset).
+ // S: the value of the symbol whose index resides in the relocation entry.
+ // Z: the size of the symbol whose index resides in the relocation entry.
+
+ switch(RelType) {
+ case R_X86_64_NONE:
+ case R_X86_64_COPY:
+ // none
+ break;
+ case R_X86_64_64:
+ case R_X86_64_16:
+ case R_X86_64_8:
+ // S + A
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ // S + A (We don't care about the result not fitting in 32 bits.)
+ case R_X86_64_PC32:
+ case R_X86_64_PC16:
+ case R_X86_64_PC8:
+ case R_X86_64_PC64:
+ // S + A - P (P/pcrel is implicit)
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ case R_X86_64_GOT32:
+ case R_X86_64_GOT64:
+ case R_X86_64_GOTPC32:
+ case R_X86_64_GOTPC64:
+ case R_X86_64_GOTPLT64:
+ // G + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+ break;
+ case R_X86_64_PLT32:
+ // L + A - P -> S@PLT + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+ break;
+ case R_X86_64_GLOB_DAT:
+ case R_X86_64_JUMP_SLOT:
+ // S
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ case R_X86_64_GOTPCREL:
+ case R_X86_64_GOTPCREL64:
+ // G + GOT + A - P -> S@GOTPCREL + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case R_X86_64_GOTOFF64:
+ // S + A - GOT
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+ break;
+ case R_X86_64_PLTOFF64:
+ // L + A - GOT
+ break;
+ case R_X86_64_SIZE32:
+ case R_X86_64_SIZE64:
+ // Z + A
+ Expr = MCConstantExpr::Create(SymSize, Ctx);
+ break;
+ default:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ }
+ if (Expr && hasAddend && Addend != 0)
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(Addend, Ctx),
+ Ctx);
+ return Expr;
+ }
+};
+} // End unnamed namespace
+
+/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
+ // We only handle x86-64 for now.
+ return new X86_64ELFRelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 5fbefae..8515879 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -53,7 +53,7 @@ public:
}
unsigned GetX86RegNum(const MCOperand &MO) const {
- return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
}
// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -77,6 +77,14 @@ public:
return (~SrcRegNum) & 0xf;
}
+ unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
+ "Invalid mask register as write-mask!");
+ unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
+ return MaskRegNum;
+ }
+
void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
OS << (char)C;
++CurByte;
@@ -152,6 +160,52 @@ static bool isDisp8(int Value) {
return Value == (signed char)Value;
}
+/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
+/// compressed dispacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+ assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) &&
+ "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+ unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask;
+ unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask;
+
+ if (CD8V == 0 && CD8E == 0) {
+ CValue = Value;
+ return isDisp8(Value);
+ }
+
+ unsigned MemObjSize = 1U << CD8E;
+ if (CD8V & 4) {
+ // Fixed vector length
+ MemObjSize *= 1U << (CD8V & 0x3);
+ } else {
+ // Modified vector length
+ bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B;
+ if (!EVEX_b) {
+ unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0;
+ EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0;
+ assert(EVEX_LL < 3 && "");
+
+ unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize;
+ NumElems /= 1U << (CD8V & 0x3);
+
+ MemObjSize *= NumElems;
+ }
+ }
+
+ unsigned MemObjMask = MemObjSize - 1;
+ assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size.");
+
+ if (Value & MemObjMask) // Unaligned offset
+ return false;
+ Value /= MemObjSize;
+ bool Ret = (Value == (signed char)Value);
+
+ if (Ret)
+ CValue = Value;
+ return Ret;
+}
+
/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
/// in an instruction with the specified TSFlags.
static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
@@ -237,6 +291,14 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) {
return GOT_Normal;
}
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+ }
+ return false;
+}
+
void X86MCCodeEmitter::
EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
@@ -268,8 +330,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
if (Kind == GOT_Normal)
ImmOffset = CurByte;
} else if (Expr->getKind() == MCExpr::SymbolRef) {
- const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
- if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL) {
+ if (HasSecRelSymbolRef(Expr)) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
+ } else if (Expr->getKind() == MCExpr::Binary) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+ if (HasSecRelSymbolRef(Bin->getLHS())
+ || HasSecRelSymbolRef(Bin->getRHS())) {
FixupKind = MCFixupKind(FK_SecRel_4);
}
}
@@ -305,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
unsigned BaseReg = Base.getReg();
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
// Handle %rip relative addressing.
if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
@@ -365,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
- if (Disp.isImm() && isDisp8(Disp.getImm())) {
- EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
- return;
+ if (Disp.isImm()) {
+ if (!HasEVEX && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+ // 32-bit displacement.
+ int CDisp8 = 0;
+ if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ CDisp8 - Disp.getImm());
+ return;
+ }
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
@@ -384,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
bool ForceDisp32 = false;
bool ForceDisp8 = false;
+ int CDisp8 = 0;
+ int ImmOffset = 0;
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
@@ -399,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
- } else if (isDisp8(Disp.getImm())) {
+ } else if (!HasEVEX && isDisp8(Disp.getImm())) {
// Emit the disp8 encoding.
EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ ImmOffset = CDisp8 - Disp.getImm();
} else {
// Emit the normal disp32 encoding.
EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
@@ -432,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Do we need to output a displacement?
if (ForceDisp8)
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
else if (ForceDisp32 || Disp.getImm() != 0)
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
CurByte, OS, Fixups);
@@ -444,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
raw_ostream &OS) const {
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
@@ -455,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0: Same as REX_R=1 (64 bit mode only)
//
unsigned char VEX_R = 0x1;
+ unsigned char EVEX_R2 = 0x1;
// VEX_X: equivalent to REX.X, only used when a
// register is used for index in SIB Byte.
@@ -491,6 +580,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// VEX_4V (VEX vvvv field): a register specifier
// (in 1's complement form) or 1111 if unused.
unsigned char VEX_4V = 0xf;
+ unsigned char EVEX_V2 = 0x1;
// VEX_L (Vector Length):
//
@@ -498,6 +588,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 1: 256-bit vector
//
unsigned char VEX_L = 0;
+ unsigned char EVEX_L2 = 0;
// VEX_PP: opcode extension providing equivalent
// functionality of a SIMD prefix
@@ -509,6 +600,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
//
unsigned char VEX_PP = 0;
+ // EVEX_U
+ unsigned char EVEX_U = 1; // Always '1' so far
+
+ // EVEX_z
+ unsigned char EVEX_z = 0;
+
+ // EVEX_b
+ unsigned char EVEX_b = 0;
+
+ // EVEX_aaa
+ unsigned char EVEX_aaa = 0;
+
// Encode the operand size opcode prefix as needed.
if (TSFlags & X86II::OpSize)
VEX_PP = 0x01;
@@ -521,6 +624,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
+ if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+ EVEX_L2 = 1;
+
+ if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
+ EVEX_z = 1;
+
+ if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+ EVEX_b = 1;
switch (TSFlags & X86II::Op0Mask) {
default: llvm_unreachable("Invalid prefix!");
@@ -567,12 +678,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned CurOp = 0;
if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
- assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
// Special case for GATHER with 2 TIED_TO operands
// Skip the first 2 operands: dst, mask_wb
CurOp += 2;
- }
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ ++CurOp;
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
@@ -582,18 +700,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MemAddr, src1(VEX_4V), src2(ModR/M)
// MemAddr, src1(ModR/M), imm8
//
- if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrBaseReg).getReg()))
VEX_B = 0x0;
- if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
+
+ CurOp += X86::AddrNumOperands;
- CurOp = X86::AddrNumOperands;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
const MCOperand &MO = MI.getOperand(CurOp);
- if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
- VEX_R = 0x0;
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MO.getReg()))
+ EVEX_R2 = 0x0;
+ }
break;
}
case X86II::MRMSrcMem:
@@ -606,11 +741,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
- if (HasVEX_4V)
+ if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -618,6 +763,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
if (HasVEX_4VOp3)
// Instruction format for 4VOp3:
@@ -634,8 +782,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MRM[0-9]m instructions forms:
// MemAddr
// src1(VEX_4V), MemAddr
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ }
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -656,16 +811,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
CurOp++;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (HasMemOp4) // Skip second register source (encoded in I8IMM)
CurOp++;
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
CurOp++;
if (HasVEX_4VOp3)
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
@@ -677,13 +843,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
CurOp++;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
break;
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
@@ -691,9 +868,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::MRM6r: case X86II::MRM7r:
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
- VEX_4V = getVEXRegisterEncoding(MI, 0);
- if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
break;
default: // RawFrm
break;
@@ -702,29 +888,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Emit segment override opcode prefix as needed.
EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
- // VEX opcode prefix can have 2 or 3 bytes
- //
- // 3 bytes:
- // +-----+ +--------------+ +-------------------+
- // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
- // +-----+ +--------------+ +-------------------+
- // 2 bytes:
- // +-----+ +-------------------+
- // | C5h | | R | vvvv | L | pp |
- // +-----+ +-------------------+
- //
- unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+ if (!HasEVEX) {
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
- if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
- EmitByte(0xC5, CurByte, OS);
- EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
- return;
- }
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
- // 3 byte VEX prefix
- EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
- EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ // 3 byte VEX prefix
+ EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ } else {
+ // EVEX opcode prefix can have 4 bytes
+ //
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ assert((VEX_5M & 0x3) == VEX_5M
+ && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ VEX_5M &= 0x3;
+
+ EmitByte(0x62, CurByte, OS);
+ EmitByte((VEX_R << 7) |
+ (VEX_X << 6) |
+ (VEX_B << 5) |
+ (EVEX_R2 << 4) |
+ VEX_5M, CurByte, OS);
+ EmitByte((VEX_W << 7) |
+ (VEX_4V << 3) |
+ (EVEX_U << 2) |
+ VEX_PP, CurByte, OS);
+ EmitByte((EVEX_z << 7) |
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ }
}
/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
@@ -979,18 +1194,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
return;
- // If this is a two-address instruction, skip one of the register operands.
- // FIXME: This should be handled during MCInst lowering.
unsigned NumOps = Desc.getNumOperands();
- unsigned CurOp = 0;
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
- ++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
- assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
- // Special case for GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- CurOp += 2;
- }
+ unsigned CurOp = X86II::getOperandBias(Desc);
// Keep track of the current byte being emitted.
unsigned CurByte = 0;
@@ -1004,6 +1209,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
const unsigned MemOp4_I8IMMOperand = 2;
+ // It uses the EVEX.aaa field?
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
if (MemoryOperand != -1) MemoryOperand += CurOp;
@@ -1054,6 +1263,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + 1;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1066,6 +1278,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + X86::AddrNumOperands;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1079,6 +1294,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + 1;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1097,6 +1315,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRMSrcMem: {
int AddrOperands = X86::AddrNumOperands;
unsigned FirstMemOp = CurOp+1;
+
+ if (HasEVEX_K) { // Skip writemask
+ ++AddrOperands;
+ ++FirstMemOp;
+ }
+
if (HasVEX_4V) {
++AddrOperands;
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
@@ -1136,17 +1360,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
TSFlags, CurByte, OS, Fixups);
CurOp += X86::AddrNumOperands;
break;
- case X86II::MRM_C1: case X86II::MRM_C2:
- case X86II::MRM_C3: case X86II::MRM_C4:
- case X86II::MRM_C8: case X86II::MRM_C9:
- case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D5:
- case X86II::MRM_D8: case X86II::MRM_D9:
- case X86II::MRM_DA: case X86II::MRM_DB:
- case X86II::MRM_DC: case X86II::MRM_DD:
- case X86II::MRM_DE: case X86II::MRM_DF:
- case X86II::MRM_E8: case X86II::MRM_F0:
- case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+ case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_D0:
+ case X86II::MRM_D1: case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D6: case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
+ case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8:
+ case X86II::MRM_F9:
EmitByte(BaseOpcode, CurByte, OS);
unsigned char MRM;
@@ -1158,10 +1380,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_C4: MRM = 0xC4; break;
case X86II::MRM_C8: MRM = 0xC8; break;
case X86II::MRM_C9: MRM = 0xC9; break;
+ case X86II::MRM_CA: MRM = 0xCA; break;
+ case X86II::MRM_CB: MRM = 0xCB; break;
case X86II::MRM_D0: MRM = 0xD0; break;
case X86II::MRM_D1: MRM = 0xD1; break;
case X86II::MRM_D4: MRM = 0xD4; break;
case X86II::MRM_D5: MRM = 0xD5; break;
+ case X86II::MRM_D6: MRM = 0xD6; break;
case X86II::MRM_D8: MRM = 0xD8; break;
case X86II::MRM_D9: MRM = 0xD9; break;
case X86II::MRM_DA: MRM = 0xDA; break;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5e84530..bd23ce4 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -263,7 +263,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
return X;
}
-static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
bool is64Bit = TheTriple.getArch() == Triple::x86_64;
@@ -290,14 +290,16 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
int stackGrowth = is64Bit ? -8 : -4;
// Initial state of the frame pointer is esp+stackGrowth.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
- MAI->addInitialFrameState(0, Dst, Src);
+ unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+ MAI->addInitialFrameState(Inst);
// Add return address to move list
- MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
- MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
- MAI->addInitialFrameState(0, CSDst, CSSrc);
+ unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+ MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+ 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+ MAI->addInitialFrameState(Inst2);
return MAI;
}
@@ -382,6 +384,17 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
return 0;
}
+static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
+ MCContext &Ctx) {
+ Triple TheTriple(TT);
+ if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64)
+ return createX86_64MachORelocationInfo(Ctx);
+ else if (TheTriple.isOSBinFormatELF())
+ return createX86_64ELFRelocationInfo(Ctx);
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
return new MCInstrAnalysis(Info);
}
@@ -439,4 +452,10 @@ extern "C" void LLVMInitializeX86TargetMC() {
createX86MCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
createX86MCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target,
+ createX86MCRelocationInfo);
+ TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target,
+ createX86MCRelocationInfo);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 981aa1a..2f459b4 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -25,6 +25,7 @@ class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MCRelocationInfo;
class Target;
class StringRef;
class raw_ostream;
@@ -94,6 +95,12 @@ MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
uint16_t EMachine);
/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+
+/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info.
+MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
+
+/// createX86_64ELFORelocationInfo - Construct X86-64 ELF relocation info.
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
} // End llvm namespace
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
new file mode 100644
index 0000000..75b5acf
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -0,0 +1,116 @@
+//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/MachO.h"
+
+using namespace llvm;
+using namespace object;
+using namespace macho;
+
+namespace {
+class X86_64MachORelocationInfo : public MCRelocationInfo {
+public:
+ X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile());
+
+ uint64_t RelType; Rel.getType(RelType);
+ symbol_iterator SymI = Rel.getSymbol();
+
+ StringRef SymName; SymI->getName(SymName);
+ uint64_t SymAddr; SymI->getAddress(SymAddr);
+
+ RelocationEntry RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+ bool isPCRel = Obj->getAnyRelocationPCRel(RE);
+
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (Sym->isVariable() == false)
+ Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+ const MCExpr *Expr = 0;
+
+ switch(RelType) {
+ case RIT_X86_64_TLV:
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ break;
+ case RIT_X86_64_Signed4:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(4, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_Signed2:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(2, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_Signed1:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(1, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_GOTLoad:
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case RIT_X86_64_GOT:
+ Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
+ MCSymbolRefExpr::VK_GOTPCREL :
+ MCSymbolRefExpr::VK_GOT,
+ Ctx);
+ break;
+ case RIT_X86_64_Subtractor:
+ {
+ RelocationRef RelNext;
+ Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
+ RelocationEntry RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+
+ // X86_64_SUBTRACTOR must be followed by a relocation of type
+ // X86_64_RELOC_UNSIGNED .
+ // NOTE: Scattered relocations don't exist on x86_64.
+ unsigned RType = Obj->getAnyRelocationType(RENext);
+ if (RType != RIT_X86_64_Unsigned)
+ report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+ "X86_64_RELOC_SUBTRACTOR.");
+
+ const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
+
+ symbol_iterator RSymI = RelNext.getSymbol();
+ uint64_t RSymAddr;
+ RSymI->getAddress(RSymAddr);
+ StringRef RSymName;
+ RSymI->getName(RSymName);
+
+ MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName);
+ if (RSym->isVariable() == false)
+ RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx));
+
+ const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx);
+
+ Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+ break;
+ }
+ default:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ }
+ return Expr;
+ }
+};
+} // End unnamed namespace
+
+/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
+ return new X86_64MachORelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index bc272ef..ed64a32 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -9,6 +9,8 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/ErrorHandling.h"
@@ -27,7 +29,9 @@ namespace {
X86WinCOFFObjectWriter(bool Is64Bit_);
~X86WinCOFFObjectWriter();
- virtual unsigned getRelocType(unsigned FixupKind) const;
+ virtual unsigned getRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection) const LLVM_OVERRIDE;
};
}
@@ -38,7 +42,14 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
-unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const {
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection) const {
+ unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
switch (FixupKind) {
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
@@ -46,6 +57,9 @@ unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const {
return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
case FK_Data_4:
case X86::reloc_signed_4byte:
+ if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB :
+ COFF::IMAGE_REL_I386_DIR32NB;
return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
case FK_Data_8:
if (Is64Bit)
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 496b704..adfa7fa 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -517,37 +517,6 @@ to <2 x i64> ops being so bad.
//===---------------------------------------------------------------------===//
-'select' on vectors and scalars could be a whole lot better. We currently
-lower them to conditional branches. On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
- ucomisd %xmm0, %xmm1
- ja LBB1_2 # entry
-LBB1_1: # entry
- movapd %xmm3, %xmm2
-LBB1_2: # entry
- movapd %xmm2, %xmm0
- ret
-
-instead of:
-
-_test:
- cmpltsd %xmm1, %xmm0
- andpd %xmm0, %xmm2
- andnpd %xmm3, %xmm0
- orpd %xmm2, %xmm0
- ret
-
-For unpredictable branches, the later is much more efficient. This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
LLVM currently generates stack realignment code, when it is not necessary
needed. The problem is that we need to know about stack alignment too early,
before RA runs.
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 1f9919f..947002f 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -69,6 +69,11 @@ ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
/// createX86PadShortFunctions - Return a pass that pads short functions
/// with NOOPs. This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
+/// createX86FixupLEAs - Return a a pass that selectively replaces
+/// certain instructions (like add, sub, inc, dec, some shifts,
+/// and some multiplies) by equivalent LEA instructions, in order
+/// to eliminate execution delays in some Atom processors.
+FunctionPass *createX86FixupLEAs();
} // End llvm namespace
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 0216252..461ea9b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -86,6 +86,19 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512",
+ "Enable AVX-512 instructions",
+ [FeatureAVX2]>;
+def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true",
+ "Enable AVX-512 Exponential and Reciprocal Instructions",
+ [FeatureAVX512]>;
+def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true",
+ "Enable AVX-512 Conflict Detection Instructions",
+ [FeatureAVX512]>;
+def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true",
+ "Enable AVX-512 PreFetch Instructions",
+ [FeatureAVX512]>;
+
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -120,8 +133,14 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
+def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
+ "Support HLE">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
+def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+ "Support PRFCHW instructions">;
+def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+ "Support RDSEED instruction">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
@@ -130,6 +149,11 @@ def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+ "CallRegIndirect", "true",
+ "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+ "LEA instruction needs inputs at AG stage">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -143,9 +167,6 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-class AtomProc<string Name, list<SubtargetFeature> Features>
- : ProcessorModel<Name, AtomModel, Features>;
-
def : Proc<"generic", []>;
def : Proc<"i386", []>;
def : Proc<"i486", []>;
@@ -162,46 +183,71 @@ def : Proc<"pentium4", [FeatureSSE2]>;
def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
FeatureFastUAMem]>;
-def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
-def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
-def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
-def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
- FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
- FeatureSlowDivide, FeaturePadShortFunctions]>;
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+ [FeatureSSE3, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel,
+ [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : ProcessorModel<"penryn", SandyBridgeModel,
+ [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Atom.
+def : ProcessorModel<"atom", AtomModel,
+ [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+ FeatureSlowDivide,
+ FeatureCallRegIndirect,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions]>;
+
// "Arrandale" along with corei3 and corei5
-def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES]>;
-def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem,
- FeaturePOPCNT]>;
+def : ProcessorModel<"corei7", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>;
+
+def : ProcessorModel<"nehalem", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT]>;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+def : ProcessorModel<"westmere", SandyBridgeModel,
+ [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+ FeatureFastUAMem, FeaturePOPCNT, FeatureAES,
+ FeaturePCLMUL]>;
// Sandy Bridge
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
-def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+def : ProcessorModel<"corei7-avx", SandyBridgeModel,
+ [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
// Ivy Bridge
-def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL,
- FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>;
+def : ProcessorModel<"core-avx-i", SandyBridgeModel,
+ [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+ FeatureF16C, FeatureFSGSBase]>;
// Haswell
-def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL,
- FeatureRDRAND, FeatureF16C, FeatureFSGSBase,
- FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA,
- FeatureRTM]>;
+def : ProcessorModel<"core-avx2", HaswellModel,
+ [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+ FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
+ FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
+ FeatureHLE]>;
+
+// KNL
+// FIXME: define KNL model
+def : ProcessorModel<"knl", HaswellModel,
+ [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
+ FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+ FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+ FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -231,11 +277,16 @@ def : Proc<"amdfam10", [FeatureSSE4A,
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
FeatureLZCNT, FeaturePOPCNT]>;
+// Jaguar
+def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL, FeatureBMI,
+ FeatureF16C, FeatureMOVBE, FeatureLZCNT,
+ FeaturePOPCNT]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureLZCNT, FeaturePOPCNT]>;
-// Enhanced Bulldozer
+// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
@@ -279,6 +330,9 @@ def ATTAsmParser : AsmParser {
def ATTAsmParserVariant : AsmParserVariant {
int Variant = 0;
+ // Variant name.
+ string Name = "att";
+
// Discard comments in assembly strings.
string CommentDelimiter = "#";
@@ -289,6 +343,9 @@ def ATTAsmParserVariant : AsmParserVariant {
def IntelAsmParserVariant : AsmParserVariant {
int Variant = 1;
+ // Variant name.
+ string Name = "intel";
+
// Discard comments in assembly strings.
string CommentDelimiter = ";";
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index ac5daec..9e0ab82 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -201,7 +201,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
case X86II::MO_TLVP_PIC_BASE:
O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
break;
- case X86II::MO_SECREL: O << "@SECREL"; break;
+ case X86II::MO_SECREL: O << "@SECREL32"; break;
}
}
@@ -702,48 +702,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
}
-MachineLocation
-X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
- MachineLocation Location;
- assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
- // Frame address. Currently handles register +- offset only.
-
- if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm())
- Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
- else {
- DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
- }
- return Location;
-}
-
-void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
- raw_ostream &O) {
- // Only the target-dependent form of DBG_VALUE should get here.
- // Referencing the offset and metadata as NOps-2 and NOps-1 is
- // probably portable to other targets; frame pointer location is not.
- unsigned NOps = MI->getNumOperands();
- assert(NOps==7);
- O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
- // cast away const; DIetc do not take const operands for some reason.
- DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
- if (V.getContext().isSubprogram())
- O << DISubprogram(V.getContext()).getDisplayName() << ":";
- O << V.getName();
- O << " <- ";
- // Frame address. Currently handles register +- offset only.
- O << '[';
- if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg())
- printOperand(MI, 0, O);
- else
- O << "undef";
- O << '+'; printOperand(MI, 3, O);
- O << ']';
- O << "+";
- printOperand(MI, NOps-2, O);
-}
-
-
-
//===----------------------------------------------------------------------===//
// Target Registry Stuff
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index bc7496b..6eed5ce 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -67,11 +67,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
unsigned AsmVariant = 1);
virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
-
- void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
- virtual MachineLocation
- getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index b516be0..38e2591 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX-512 target feature.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
// MMX vector types are always returned in MM0. If the target doesn't have
// MM0, it doesn't support these vector types.
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
@@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
// i32, i64 in the standard way
CCDelegateTo<RetCC_X86Common>
]>;
@@ -156,6 +166,11 @@ def RetCC_X86_32 : CallingConv<[
def RetCC_X86_64 : CallingConv<[
// HiPE uses RetCC_X86_64_HiPE
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+ // Handle explicit CC selection
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -208,10 +223,15 @@ def CC_X86_64_C : CallingConv<[
// fixed arguments to vararg functions are supposed to be passed in
// registers. Actually modeling that would be a lot of work, though.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasAVX()",
+ CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
YMM4, YMM5, YMM6, YMM7]>>>>,
+ // The first 8 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
@@ -225,7 +245,11 @@ def CC_X86_64_C : CallingConv<[
// 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCAssignToStack<32, 32>>
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
]>;
// Calling convention used on Win64
@@ -246,6 +270,9 @@ def CC_X86_Win64_C : CallingConv<[
// 256 bit vectors are passed by pointer
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+ // 512 bit vectors are passed by pointer
+ CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
@@ -340,7 +367,7 @@ def CC_X86_32_Common : CallingConv<[
// The first 4 AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasAVX()",
+ CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
@@ -387,8 +414,8 @@ def CC_X86_32_ThisCall : CallingConv<[
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- // Pass sret arguments indirectly through EAX
- CCIfSRet<CCAssignToReg<[EAX]>>,
+ // Pass sret arguments indirectly through stack.
+ CCIfSRet<CCAssignToStack<4, 4>>,
// The first integer argument is passed in ECX
CCIfType<[i32], CCAssignToReg<[ECX]>>,
@@ -464,6 +491,10 @@ def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
CCDelegateTo<CC_X86_32_C>
@@ -489,6 +520,8 @@ def CC_X86_32 : CallingConv<[
def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -528,6 +561,10 @@ def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
R13, R14, R15,
(sequence "YMM%u", 6, 15))>;
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+ R12, R13, R14, R15,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
//Standard C + XMM 8-15
def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
(sequence "XMM%u", 8, 15))>;
@@ -535,3 +572,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
//Standard C + YMM 8-15
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 2518e02..5d72b44 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -53,13 +53,8 @@ namespace {
static char ID;
explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
: MachineFunctionPass(ID), II(0), TD(0), TM(tm),
- MCE(mce), PICBaseOffset(0), Is64BitMode(false),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
- Emitter(X86TargetMachine &tm, CodeEmitter &mce,
- const X86InstrInfo &ii, const DataLayout &td, bool is64)
- : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
- MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
bool runOnMachineFunction(MachineFunction &MF);
@@ -1270,7 +1265,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
- if (Opcode == X86::MOV64ri64i32)
+ if (Opcode == X86::MOV32ri64)
rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
// This should not occur on Darwin for relocatable objects.
if (Opcode == X86::MOV64ri)
@@ -1451,6 +1446,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
MCE.emitByte(BaseOpcode);
MCE.emitByte(0xC9);
break;
+ case X86II::MRM_CA:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xCA);
+ break;
+ case X86II::MRM_CB:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xCB);
+ break;
case X86II::MRM_E8:
MCE.emitByte(BaseOpcode);
MCE.emitByte(0xE8);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 85155f5..5bc3420 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -45,10 +45,6 @@ class X86FastISel : public FastISel {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// RegInfo - X86 register info.
- ///
- const X86RegisterInfo *RegInfo;
-
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -63,17 +59,16 @@ public:
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
}
virtual bool TargetSelectInstruction(const Instruction *I);
- /// TryToFoldLoad - The specified machine instr operand is a vreg, and that
+ /// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// possible.
- virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI);
+ virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI);
virtual bool FastLowerArguments();
@@ -84,8 +79,10 @@ private:
bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
- bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
- bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
+ bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
+ bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM,
+ bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
@@ -107,6 +104,8 @@ private:
bool X86SelectShift(const Instruction *I);
+ bool X86SelectDivRem(const Instruction *I);
+
bool X86SelectSelect(const Instruction *I);
bool X86SelectTrunc(const Instruction *I);
@@ -236,7 +235,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
+ const X86AddressMode &AM, bool Aligned) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
@@ -246,8 +246,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
- Val = AndResult;
+ TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
+ ValReg = AndResult;
}
// FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
@@ -263,26 +263,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
(Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
break;
case MVT::v4f32:
- Opc = X86::MOVAPSmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
- Opc = X86::MOVAPDmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- Opc = X86::MOVDQAmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
}
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc)), AM).addReg(Val);
+ DL, TII.get(Opc)), AM).addReg(ValReg);
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
- const X86AddressMode &AM) {
+ const X86AddressMode &AM, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
@@ -317,7 +326,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
if (ValReg == 0)
return false;
- return X86FastEmitStore(VT, ValReg, AM);
+ return X86FastEmitStore(VT, ValReg, AM, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -693,8 +702,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
unsigned SABIAlignment =
TD.getABITypeAlignment(S->getValueOperand()->getType());
- if (S->getAlignment() != 0 && S->getAlignment() < SABIAlignment)
- return false;
+ bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
MVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
@@ -704,7 +712,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (!X86SelectAddress(I->getOperand(1), AM))
return false;
- return X86FastEmitStore(VT, I->getOperand(0), AM);
+ return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
@@ -720,10 +728,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
- CC != CallingConv::X86_FastCall)
+ CC != CallingConv::X86_FastCall &&
+ CC != CallingConv::X86_64_SysV)
return false;
- if (Subtarget->isTargetWin64())
+ if (Subtarget->isCallingConvWin64(CC))
return false;
// Don't handle popping bytes on return for now.
@@ -816,14 +825,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// The x86-64 ABI for returning structs by value requires that we copy
// the sret argument into %rax for the return. We saved the argument into
// a virtual register in the entry block, so now we copy the value out
- // and into %rax.
- if (Subtarget->is64Bit() && F.hasStructRetAttr()) {
+ // and into %rax. We also do the same with %eax for Win32.
+ if (F.hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
+ unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- X86::RAX).addReg(Reg);
- RetRegs.push_back(X86::RAX);
+ RetReg).addReg(Reg);
+ RetRegs.push_back(RetReg);
}
// Now emit the RET.
@@ -1006,10 +1017,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
}
bool X86FastISel::X86SelectZExt(const Instruction *I) {
- // Handle zero-extension from i1 to i8, which is common.
- if (!I->getOperand(0)->getType()->isIntegerTy(1))
- return false;
-
EVT DstVT = TLI.getValueType(I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
@@ -1018,12 +1025,37 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
if (ResultReg == 0)
return false;
- // Set the high bits to zero.
- ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
- if (ResultReg == 0)
- return false;
+ // Handle zero-extension from i1 to i8, which is common.
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()).getSimpleVT();
+ if (SrcVT.SimpleTy == MVT::i1) {
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+
+ if (ResultReg == 0)
+ return false;
+ }
+
+ if (DstVT == MVT::i64) {
+ // Handle extension to 64-bits via sub-register shenanigans.
+ unsigned MovInst;
+
+ switch (SrcVT.SimpleTy) {
+ case MVT::i8: MovInst = X86::MOVZX32rr8; break;
+ case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+ case MVT::i32: MovInst = X86::MOV32rr; break;
+ default: llvm_unreachable("Unexpected zext to i64 source type");
+ }
+
+ unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32)
+ .addReg(ResultReg);
- if (DstVT != MVT::i8) {
+ ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ResultReg)
+ .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT != MVT::i8) {
ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
ResultReg, /*Kill=*/true);
if (ResultReg == 0)
@@ -1233,6 +1265,170 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
return true;
}
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+ const static unsigned NumTypes = 4; // i8, i16, i32, i64
+ const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
+ const static bool S = true; // IsSigned
+ const static bool U = false; // !IsSigned
+ const static unsigned Copy = TargetOpcode::COPY;
+ // For the X86 DIV/IDIV instruction, in most cases the dividend
+ // (numerator) must be in a specific register pair highreg:lowreg,
+ // producing the quotient in lowreg and the remainder in highreg.
+ // For most data types, to set up the instruction, the dividend is
+ // copied into lowreg, and lowreg is sign-extended or zero-extended
+ // into highreg. The exception is i8, where the dividend is defined
+ // as a single register rather than a register pair, and we
+ // therefore directly sign-extend or zero-extend the dividend into
+ // lowreg, instead of copying, and ignore the highreg.
+ const static struct DivRemEntry {
+ // The following portion depends only on the data type.
+ const TargetRegisterClass *RC;
+ unsigned LowInReg; // low part of the register pair
+ unsigned HighInReg; // high part of the register pair
+ // The following portion depends on both the data type and the operation.
+ struct DivRemResult {
+ unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
+ unsigned OpSignExtend; // Opcode for sign-extending lowreg into
+ // highreg, or copying a zero into highreg.
+ unsigned OpCopy; // Opcode for copying dividend into lowreg, or
+ // zero/sign-extending into lowreg for i8.
+ unsigned DivRemResultReg; // Register containing the desired result.
+ bool IsOpSigned; // Whether to use signed or unsigned form.
+ } ResultTable[NumOps];
+ } OpTable[NumTypes] = {
+ { &X86::GR8RegClass, X86::AX, 0, {
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
+ { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
+ { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
+ }
+ }, // i8
+ { &X86::GR16RegClass, X86::AX, X86::DX, {
+ { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
+ { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
+ }
+ }, // i16
+ { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
+ { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
+ { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
+ }
+ }, // i32
+ { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+ { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
+ { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+ }
+ }, // i64
+ };
+
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned TypeIndex, OpIndex;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::i8: TypeIndex = 0; break;
+ case MVT::i16: TypeIndex = 1; break;
+ case MVT::i32: TypeIndex = 2; break;
+ case MVT::i64: TypeIndex = 3;
+ if (!Subtarget->is64Bit())
+ return false;
+ break;
+ }
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected div/rem opcode");
+ case Instruction::SDiv: OpIndex = 0; break;
+ case Instruction::SRem: OpIndex = 1; break;
+ case Instruction::UDiv: OpIndex = 2; break;
+ case Instruction::URem: OpIndex = 3; break;
+ }
+
+ const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+ const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0)
+ return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0)
+ return false;
+
+ // Move op0 into low-order input register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+ // Zero-extend or sign-extend into high-order input register.
+ if (OpEntry.OpSignExtend) {
+ if (OpEntry.IsOpSigned)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpSignExtend));
+ else {
+ unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(X86::MOV32r0), Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough to
+ // fit neatly into the table above.
+ if (VT.SimpleTy == MVT::i16) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (VT.SimpleTy == MVT::i32) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (VT.SimpleTy == MVT::i64) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+ }
+ }
+ }
+ // Generate the DIV/IDIV instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+ // For i8 remainder, we can't reference AH directly, as we'll end
+ // up with bogus copies like %R9B = COPY %AH. Reference AX
+ // instead to prevent AH references in a REX instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ unsigned ResultReg = 0;
+ if ((I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem) &&
+ OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+ unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri),
+ ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+ /*Kill=*/true, X86::sub_8bit);
+ }
+ // Copy the result out of the physreg if we haven't already.
+ if (!ResultReg) {
+ ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
+ UpdateValueMap(I, ResultReg);
+
+ return true;
+}
+
bool X86FastISel::X86SelectSelect(const Instruction *I) {
MVT VT;
if (!isTypeLegal(I->getType(), VT))
@@ -1526,9 +1722,6 @@ bool X86FastISel::FastLowerArguments() {
if (!FuncInfo.CanLowerReturn)
return false;
- if (Subtarget->isTargetWindows())
- return false;
-
const Function *F = FuncInfo.Fn;
if (F->isVarArg())
return false;
@@ -1536,7 +1729,10 @@ bool X86FastISel::FastLowerArguments() {
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C)
return false;
-
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
if (!Subtarget->is64Bit())
return false;
@@ -1580,8 +1776,6 @@ bool X86FastISel::FastLowerArguments() {
const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
I != E; ++I, ++Idx) {
- if (I->use_empty())
- continue;
bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
@@ -1640,8 +1834,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Handle only C and fastcc calling conventions for now.
ImmutableCallSite CS(CI);
CallingConv::ID CC = CS.getCallingConv();
+ bool isWin64 = Subtarget->isCallingConvWin64(CC);
if (CC != CallingConv::C && CC != CallingConv::Fast &&
- CC != CallingConv::X86_FastCall)
+ CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 &&
+ CC != CallingConv::X86_64_SysV)
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
@@ -1655,7 +1851,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Don't know how to handle Win64 varargs yet. Nothing special needed for
// x86-32. Special handling for x86-64 is implemented.
- if (isVarArg && Subtarget->isTargetWin64())
+ if (isVarArg && isWin64)
return false;
// Fast-isel doesn't know about callee-pop yet.
@@ -1785,7 +1981,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
I->getParent()->getContext());
// Allocate shadow area for Win64
- if (Subtarget->isTargetWin64())
+ if (isWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
@@ -1868,6 +2064,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
} else {
unsigned LocMemOffset = VA.getLocMemOffset();
X86AddressMode AM;
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(
+ getTargetMachine()->getRegisterInfo());
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
const Value *ArgVal = ArgVals[VA.getValNo()];
@@ -1899,7 +2097,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
X86::EBX).addReg(Base);
}
- if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+ if (Subtarget->is64Bit() && isVarArg && !isWin64) {
// Count the number of XMM registers allocated.
static const uint16_t XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -1968,7 +2166,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (Subtarget->isPICStyleGOT())
MIB.addReg(X86::EBX, RegState::Implicit);
- if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+ if (Subtarget->is64Bit() && isVarArg && !isWin64)
MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
@@ -2082,6 +2280,11 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) {
case Instruction::AShr:
case Instruction::Shl:
return X86SelectShift(I);
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return X86SelectDivRem(I);
case Instruction::Select:
return X86SelectSelect(I);
case Instruction::Trunc:
@@ -2273,12 +2476,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
}
-/// TryToFoldLoad - The specified machine instr operand is a vreg, and that
-/// vreg is being provided by the specified load instruction. If possible,
-/// try to fold the load as an operand to the instruction, returning true if
-/// possible.
-bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI) {
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) {
X86AddressMode AM;
if (!X86SelectAddress(LI->getOperand(0), AM))
return false;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 0000000..d21cb8a
--- /dev/null
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,253 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will find instructions which
+// can be re-written as LEA instructions in order to reduce pipeline
+// delays for some models of the Intel Atom family.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+ class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+ static char ID;
+ /// \brief Loop over all of the instructions in the basic block
+ /// replacing applicable instructions with LEA instructions,
+ /// where appropriate.
+ bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+ virtual const char *getPassName() const { return "X86 Atom LEA Fixup";}
+
+ /// \brief Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
+ void processInstruction(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand& p,
+ MachineBasicBlock::iterator I);
+
+ /// \brief Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI);
+
+ /// \brief if an instruction can be converted to an
+ /// equivalent LEA, insert the new instruction into the basic block
+ /// and return a pointer to it. Otherwise, return zero.
+ MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const;
+
+ public:
+ FixupLEAPass() : MachineFunctionPass(ID) {}
+
+ /// \brief Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII; // Machine instruction info.
+
+ };
+ char FixupLEAPass::ID = 0;
+}
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const {
+ MachineInstr* MI = MBBI;
+ MachineInstr* NewMI;
+ switch (MI->getOpcode()) {
+ case X86::MOV32rr:
+ case X86::MOV64rr: {
+ const MachineOperand& Src = MI->getOperand(1);
+ const MachineOperand& Dest = MI->getOperand(0);
+ NewMI = BuildMI(*MF, MI->getDebugLoc(),
+ TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0);
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8_DB:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri_DB:
+ case X86::ADD32ri8_DB:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri_DB:
+ case X86::ADD16ri8_DB:
+ if (!MI->getOperand(2).isImm()) {
+ // convertToThreeAddress will call getImm()
+ // which requires isImm() to be true
+ return 0;
+ }
+ }
+ return TII->convertToThreeAddress(MFI, MBBI, 0);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() {
+ return new FixupLEAPass();
+}
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ TM = &MF->getTarget();
+ TII = TM->getInstrInfo();
+
+ DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+ // Process all basic blocks.
+ for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
+ processBasicBlock(Func, I);
+ DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+ return true;
+}
+
+FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
+ MachineBasicBlock::iterator I) {
+ RegUsageState RegUsage = RU_NotUsed;
+ MachineInstr* MI = I;
+
+ for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand& opnd = MI->getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()){
+ if (opnd.isDef())
+ return RU_Write;
+ RegUsage = RU_Read;
+ }
+ }
+ return RegUsage;
+}
+
+/// getPreviousInstr - Given a reference to an instruction in a basic
+/// block, return a reference to the previous instruction in the block,
+/// wrapping around to the last instruction of the block if the block
+/// branches to itself.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ if (I == MFI->begin()) {
+ if (MFI->isPredecessor(MFI)) {
+ I = --MFI->end();
+ return true;
+ }
+ else
+ return false;
+ }
+ --I;
+ return true;
+}
+
+MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ int InstrDistance = 1;
+ MachineBasicBlock::iterator CurInst;
+ static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+ CurInst = I;
+ bool Found;
+ Found = getPreviousInstr(CurInst, MFI);
+ while( Found && I != CurInst) {
+ if (CurInst->isCall() || CurInst->isInlineAsm())
+ break;
+ if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+ break; // too far back to make a difference
+ if (usesRegister(p, CurInst) == RU_Write){
+ return CurInst;
+ }
+ InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
+ Found = getPreviousInstr(CurInst, MFI);
+ }
+ return 0;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ // Process a load, store, or LEA instruction.
+ MachineInstr *MI = I;
+ int opcode = MI->getOpcode();
+ const MCInstrDesc& Desc = MI->getDesc();
+ int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
+ if (AddrOffset >= 0) {
+ AddrOffset += X86II::getOperandBias(Desc);
+ MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ if (p.isReg() && p.getReg() != X86::ESP) {
+ seekLEAFixup(p, I, MFI);
+ }
+ MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ if (q.isReg() && q.getReg() != X86::ESP) {
+ seekLEAFixup(q, I, MFI);
+ }
+ }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand& p,
+ MachineBasicBlock::iterator& I,
+ MachineFunction::iterator MFI) {
+ MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+ if (MBI) {
+ MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
+ if (NewMI) {
+ ++NumLEAs;
+ DEBUG(dbgs() << "Candidate to replace:"; MBI->dump(););
+ // now to replace with an equivalent LEA...
+ DEBUG(dbgs() << "Replaced by: "; NewMI->dump(););
+ MFI->erase(MBI);
+ MachineBasicBlock::iterator J =
+ static_cast<MachineBasicBlock::iterator> (NewMI);
+ processInstruction(J, MFI);
+ }
+ }
+}
+
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+ MachineFunction::iterator MFI) {
+
+ for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
+ processInstruction(I, MFI);
+ return false;
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 0585b43..48470da 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -115,9 +115,10 @@ namespace {
unsigned Mask = 0;
for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
E = MBB->livein_end(); I != E; ++I) {
- unsigned Reg = *I - X86::FP0;
- if (Reg < 8)
- Mask |= 1 << Reg;
+ unsigned Reg = *I;
+ if (Reg < X86::FP0 || Reg > X86::FP6)
+ continue;
+ Mask |= 1 << (Reg - X86::FP0);
}
return Mask;
}
@@ -893,8 +894,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Produce implicit-defs for free by using killed registers.
while (Kills && Defs) {
- unsigned KReg = CountTrailingZeros_32(Kills);
- unsigned DReg = CountTrailingZeros_32(Defs);
+ unsigned KReg = countTrailingZeros(Kills);
+ unsigned DReg = countTrailingZeros(Defs);
DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
std::swap(RegMap[KReg], RegMap[DReg]);
@@ -917,7 +918,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Manually kill the rest.
while (Kills) {
- unsigned KReg = CountTrailingZeros_32(Kills);
+ unsigned KReg = countTrailingZeros(Kills);
DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
freeStackSlotBefore(I, KReg);
Kills &= ~(1 << KReg);
@@ -925,7 +926,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Load zeros for all the imp-defs.
while(Defs) {
- unsigned DReg = CountTrailingZeros_32(Defs);
+ unsigned DReg = countTrailingZeros(Defs);
DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
pushReg(DReg);
@@ -1636,7 +1637,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
// Note: this might be a non-optimal pop sequence. We might be able to do
// better by trying to pop in stack order or something.
while (FPKills) {
- unsigned FPReg = CountTrailingZeros_32(FPKills);
+ unsigned FPReg = countTrailingZeros(FPKills);
if (isLive(FPReg))
freeStackSlotAfter(InsertPt, FPReg);
FPKills &= ~(1U << FPReg);
@@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
.addExternalSymbol("_ftol2")
.addReg(X86::ST0, RegState::ImplicitKill)
+ .addReg(X86::ECX, RegState::ImplicitDefine)
.addReg(X86::EAX, RegState::Define | RegState::Implicit)
.addReg(X86::EDX, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 54cbd40..b994e67 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -307,12 +307,12 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
unsigned FramePtr) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.empty()) return;
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
bool HasFP = hasFP(MF);
@@ -360,16 +360,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
if (HasFP && FramePtr == Reg)
continue;
- MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
- MachineLocation CSSrc(Reg);
- Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset));
}
}
/// getCompactUnwindRegNum - Get the compact unwind number for a given
/// register. The number corresponds to the enum lists in
/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
+static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
+ static const uint16_t CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const uint16_t CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
@@ -398,16 +404,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
// 4 3
// 5 3
//
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
- int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
if (CUReg == -1) return ~0U;
SavedRegs[i] = CUReg;
}
@@ -466,14 +464,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
static uint32_t
encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
bool Is64Bit) {
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
// Encode the registers in the order they were saved, 3-bits per register. The
// registers are numbered from 1 to CU_NUM_SAVED_REGS.
uint32_t RegEnc = 0;
@@ -481,7 +471,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
unsigned Reg = SavedRegs[I];
if (Reg == 0) continue;
- int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+ int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
if (CURegNum == -1) return ~0U;
// Encode the 3-bit register number in order, skipping over 3-bits for each
@@ -528,11 +518,17 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
if (!MI.getFlag(MachineInstr::FrameSetup)) break;
// We don't exect any more prolog instructions.
- if (ExpectEnd) return 0;
+ if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
if (Opc == PushInstr) {
// If there are too many saved registers, we cannot use compact encoding.
- if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0;
+ if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
+ ExpectEnd = true;
+ continue;
+ }
SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
StackAdjust += OffsetSize;
@@ -542,7 +538,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
unsigned DstReg = MI.getOperand(0).getReg();
if (DstReg != FramePtr || SrcReg != StackPtr)
- return 0;
+ return CU::UNWIND_MODE_DWARF;
StackAdjust = 0;
memset(SavedRegs, 0, sizeof(SavedRegs));
@@ -552,7 +548,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
if (StackSize)
// We already have a stack size.
- return 0;
+ return CU::UNWIND_MODE_DWARF;
if (!MI.getOperand(0).isReg() ||
MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
@@ -560,7 +556,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
// We need this to be a stack adjustment pointer. Something like:
//
// %RSP<def> = SUB64ri8 %RSP, 48
- return 0;
+ return CU::UNWIND_MODE_DWARF;
StackSize = MI.getOperand(2).getImm() / StackDivide;
SubtractInstrIdx += InstrOffset;
@@ -574,31 +570,31 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
if (HasFP) {
if ((StackAdjust & 0xFF) != StackAdjust)
// Offset was too big for compact encoding.
- return 0;
+ return CU::UNWIND_MODE_DWARF;
// Get the encoding of the saved registers when we have a frame pointer.
uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
- if (RegEnc == ~0U) return 0;
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
- CompactUnwindEncoding |= 0x01000000;
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
- CompactUnwindEncoding |= RegEnc & 0x7FFF;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
} else {
++StackAdjust;
uint32_t TotalStackSize = StackAdjust + StackSize;
if ((TotalStackSize & 0xFF) == TotalStackSize) {
// Frameless stack with a small stack size.
- CompactUnwindEncoding |= 0x02000000;
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
// Encode the stack size.
CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
} else {
if ((StackAdjust & 0x7) != StackAdjust)
// The extra stack adjustments are too big for us to handle.
- return 0;
+ return CU::UNWIND_MODE_DWARF;
// Frameless stack with an offset too large for us to encode compactly.
- CompactUnwindEncoding |= 0x03000000;
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
// Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
// instruction.
@@ -616,10 +612,11 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
uint32_t RegEnc =
encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
Is64Bit);
- if (RegEnc == ~0U) return 0;
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
// Encode the register encoding.
- CompactUnwindEncoding |= RegEnc & 0x3FF;
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
}
return CompactUnwindEncoding;
@@ -734,7 +731,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// REG < 64 => DW_CFA_offset + Reg
// ELSE => DW_CFA_offset_extended
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
uint64_t NumBytes = 0;
int stackGrowth = -SlotSize;
@@ -767,20 +763,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(FrameLabel);
// Define the current CFA rule to use the provided offset.
- if (StackSize) {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- } else {
- MachineLocation SPDst(StackPtr);
- MachineLocation SPSrc(StackPtr, stackGrowth);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- }
+ assert(StackSize);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth));
// Change the rule for the FramePtr to be an "offset" rule.
- MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
- MachineLocation FPSrc(FramePtr);
- Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr,
+ 2 * stackGrowth));
}
// Update EBP with the new base value.
@@ -796,9 +786,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(FrameLabel);
// Define the current CFA to use the EBP/RBP register.
- MachineLocation FPDst(FramePtr);
- MachineLocation FPSrc(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr));
}
// Mark the FramePtr as live-in in every block except the entry.
@@ -826,10 +816,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
// Define the current CFA rule to use the provided offset.
- unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
- MachineLocation SPDst(Ptr);
- MachineLocation SPSrc(Ptr, StackOffset);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ assert(StackSize);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(Label, StackOffset));
StackOffset += stackGrowth;
}
}
@@ -925,11 +914,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- // MSVC x64's __chkstk needs to adjust %rsp.
- // FIXME: %rax preserves the offset and should be available.
- if (isSPUpdateNeeded)
- emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
- UseLEA, TII, *RegInfo);
+ // MSVC x64's __chkstk does not adjust %rsp itself.
+ // It also does not clobber %rax so we can reuse it when adjusting %rsp.
+ if (isSPUpdateNeeded) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
+ .addReg(StackPtr)
+ .addReg(X86::RAX)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
if (isEAXAlive) {
// Restore EAX
@@ -963,16 +955,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
- if (StackSize) {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP,
- -StackSize + stackGrowth);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
- } else {
- MachineLocation SPDst(StackPtr);
- MachineLocation SPSrc(StackPtr, stackGrowth);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
- }
+ assert(StackSize);
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+ Label, -StackSize + stackGrowth));
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
@@ -1338,7 +1323,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
unsigned SlotSize = RegInfo->getSlotSize();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0) {
// create RETURNADDR area
@@ -1351,7 +1336,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// }
// [EBP]
MFI->CreateFixedObject(-TailCallReturnAddrDelta,
- (-1U*SlotSize)+TailCallReturnAddrDelta, true);
+ TailCallReturnAddrDelta - SlotSize, true);
}
if (hasFP(MF)) {
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 3f08b9a..6e309d8 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -19,8 +19,35 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
- class MCSymbol;
- class X86TargetMachine;
+
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+ /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
+class MCSymbol;
+class X86TargetMachine;
class X86FrameLowering : public TargetFrameLowering {
const X86TargetMachine &TM;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 00fbe69..9465420 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -141,10 +141,6 @@ namespace {
/// SelectionDAG operations.
///
class X86DAGToDAGISel : public SelectionDAGISel {
- /// X86Lowering - This object fully describes how to lower LLVM code to an
- /// X86-specific SelectionDAG.
- const X86TargetLowering &X86Lowering;
-
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
@@ -156,7 +152,6 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel),
- X86Lowering(*tm.getTargetLowering()),
Subtarget(&tm.getSubtarget<X86Subtarget>()),
OptForSize(false) {}
@@ -200,9 +195,13 @@ namespace {
bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
bool SelectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool SelectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
@@ -229,14 +228,15 @@ namespace {
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
- CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+ CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
+ getTargetLowering()->getPointerTy()) :
AM.Base_Reg;
Scale = getI8Imm(AM.Scale);
Index = AM.IndexReg;
// These are 32-bit even in 64-bit mode since RIP relative offset
// is 32-bit.
if (AM.GV)
- Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(),
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
@@ -373,7 +373,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
else
Ops.push_back(Chain.getOperand(i));
SDValue NewChain =
- CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
MVT::Other, &Ops[0], Ops.size());
Ops.clear();
Ops.push_back(NewChain);
@@ -444,7 +444,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
if (OptLevel != CodeGenOpt::None &&
- (N->getOpcode() == X86ISD::CALL ||
+ // Only does this when target favors doesn't favor register indirect
+ // call.
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
// Only does this if load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
@@ -498,8 +500,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
- bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
- bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(getTargetLowering());
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
@@ -522,7 +526,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
MemVT = SrcIsSSE ? SrcVT : DstVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
@@ -780,7 +784,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
return true;
EVT VT = N.getValueType();
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue Eight = DAG.getConstant(8, MVT::i8);
SDValue NewMask = DAG.getConstant(0xff, VT);
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
@@ -828,7 +832,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
return true;
EVT VT = N.getValueType();
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
@@ -884,8 +888,8 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
return true;
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
- unsigned MaskLZ = CountLeadingZeros_64(Mask);
- unsigned MaskTZ = CountTrailingZeros_64(Mask);
+ unsigned MaskLZ = countLeadingZeros(Mask);
+ unsigned MaskTZ = countTrailingZeros(Mask);
// The amount of shift we're trying to fit into the addressing mode is taken
// from the trailing zeros of the mask.
@@ -930,11 +934,11 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
if (ReplacingAnyExtend) {
assert(X.getValueType() != VT);
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
- SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
InsertDAGNode(DAG, N, NewX);
X = NewX;
}
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
@@ -958,7 +962,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
- DebugLoc dl = N.getDebugLoc();
+ SDLoc dl(N);
DEBUG({
dbgs() << "MatchAddress: ";
AM.dump();
@@ -1378,6 +1382,71 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
}
+bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
+ if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CN->getZExtValue();
+ if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+ return false;
+
+ Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64);
+ return true;
+ }
+
+ // In static codegen with small code model, we can get the address of a label
+ // into a register with 'movl'. TableGen has already made sure we're looking
+ // at a label of some kind.
+ assert(N->getOpcode() == X86ISD::Wrapper &&
+ "Unexpected node type for MOV32ri64");
+ N = N.getOperand(0);
+
+ if (N->getOpcode() != ISD::TargetConstantPool &&
+ N->getOpcode() != ISD::TargetJumpTable &&
+ N->getOpcode() != ISD::TargetGlobalAddress &&
+ N->getOpcode() != ISD::TargetExternalSymbol &&
+ N->getOpcode() != ISD::TargetBlockAddress)
+ return false;
+
+ Imm = N;
+ return TM.getCodeModel() == CodeModel::Small;
+}
+
+bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+ return false;
+
+ SDLoc DL(N);
+ RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+ if (RN && RN->getReg() == 0)
+ Base = CurDAG->getRegister(0, MVT::i64);
+ else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) {
+ // Base could already be %rip, particularly in the x32 ABI.
+ Base = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Base,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ }
+
+ RN = dyn_cast<RegisterSDNode>(Index);
+ if (RN && RN->getReg() == 0)
+ Index = CurDAG->getRegister(0, MVT::i64);
+ else {
+ assert(Index.getValueType() == MVT::i32 &&
+ "Expect to be extending 32-bit registers for use in LEA");
+ Index = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Index,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ }
+
+ return true;
+}
+
/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
@@ -1485,7 +1554,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg,
+ getTargetLowering()->getPointerTy()).getNode();
}
SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
@@ -1500,9 +1570,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
- MVT::i32, MVT::i32, MVT::Other, Ops,
- array_lengthof(Ops));
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
+ MVT::i32, MVT::i32, MVT::Other, Ops);
cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
return ResNode;
}
@@ -1636,7 +1705,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
// + empty, the operand is not needed any more with the new op selected.
// + non-empty, otherwise.
static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
- DebugLoc dl,
+ SDLoc dl,
enum AtomicOpc &Op, EVT NVT,
SDValue Val) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
@@ -1688,7 +1757,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
// Optimize common patterns for __sync_or_and_fetch and similar arith
// operations where the result is not used. This allows us to use the "lock"
@@ -1718,7 +1787,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
Op = ADD;
break;
}
-
+
Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
bool isUnOp = !Val.getNode();
bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
@@ -1770,12 +1839,10 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
if (isUnOp) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
- Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
- array_lengthof(Ops)), 0);
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
} else {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
- Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
- array_lengthof(Ops)), 0);
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
}
cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
SDValue RetVals[] = { Undef, Ret };
@@ -1921,7 +1988,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
if (ChainCheck)
// Make a new TokenFactor with all the other input chains except
// for the load.
- InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+ InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
MVT::Other, &ChainOps[0], ChainOps.size());
}
if (!ChainCheck)
@@ -1969,8 +2036,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
SDValue Segment = CurDAG->getRegister(0, MVT::i32);
const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
Disp, Segment, VMask, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
- VTs, Ops, array_lengthof(Ops));
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops);
// Node has 2 outputs: VDst and MVT::Other.
// ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
// We replace VDst of Node with VDst of ResNode, and Other of Node with Other
@@ -1984,7 +2050,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT NVT = Node->getValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
@@ -2015,6 +2081,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_avx2_gather_d_d_256:
case Intrinsic::x86_avx2_gather_q_d:
case Intrinsic::x86_avx2_gather_q_d_256: {
+ if (!Subtarget->hasAVX2())
+ break;
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
@@ -2184,7 +2252,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
SDValue Ops[] = {N1, InFlag};
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
@@ -2265,16 +2333,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
InFlag };
if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
- array_lengthof(Ops));
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
ResHi = SDValue(CNode, 0);
ResLo = SDValue(CNode, 1);
Chain = SDValue(CNode, 2);
InFlag = SDValue(CNode, 3);
} else {
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
- array_lengthof(Ops));
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
Chain = SDValue(CNode, 0);
InFlag = SDValue(CNode, 1);
}
@@ -2285,15 +2351,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = { N1, InFlag };
if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
- array_lengthof(Ops));
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
ResHi = SDValue(CNode, 0);
ResLo = SDValue(CNode, 1);
InFlag = SDValue(CNode, 2);
} else {
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
- array_lengthof(Ops));
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
InFlag = SDValue(CNode, 0);
}
}
@@ -2369,27 +2433,24 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
unsigned LoReg, HiReg, ClrReg;
- unsigned ClrOpcode, SExtOpcode;
+ unsigned SExtOpcode;
switch (NVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
- ClrOpcode = 0;
SExtOpcode = X86::CBW;
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
- ClrOpcode = X86::MOV16r0; ClrReg = X86::DX;
+ ClrReg = X86::DX;
SExtOpcode = X86::CWD;
break;
case MVT::i32:
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
- ClrOpcode = X86::MOV32r0;
SExtOpcode = X86::CDQ;
break;
case MVT::i64:
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
- ClrOpcode = X86::MOV64r0;
SExtOpcode = X86::CQO;
break;
}
@@ -2407,8 +2468,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
Move =
SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
- MVT::Other, Ops,
- array_lengthof(Ops)), 0);
+ MVT::Other, Ops), 0);
Chain = Move.getValue(1);
ReplaceUses(N0.getValue(1), Chain);
} else {
@@ -2428,8 +2488,29 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
- SDValue ClrNode =
- SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+ SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+ switch (NVT.getSimpleVT().SimpleTy) {
+ case MVT::i16:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+ CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)),
+ 0);
+ break;
+ case MVT::i32:
+ break;
+ case MVT::i64:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64), ClrNode,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ break;
+ default:
+ llvm_unreachable("Unexpected division source");
+ }
+
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
ClrNode, InFlag).getValue(1);
}
@@ -2439,8 +2520,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
SDNode *CNode =
- CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops,
- array_lengthof(Ops));
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
@@ -2451,6 +2531,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Prevent use of AH in a REX instruction by referencing AX instead.
// Shift it down 8 bits.
+ //
+ // The current assumption of the register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
if (HiReg == X86::AH && Subtarget->is64Bit() &&
!SDValue(Node, 1).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
@@ -2671,9 +2756,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT LdVT = LoadNode->getMemoryVT();
unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
- Node->getDebugLoc(),
- MVT::i32, MVT::Other, Ops,
- array_lengthof(Ops));
+ SDLoc(Node),
+ MVT::i32, MVT::Other, Ops);
Result->setMemRefs(MemOp, MemOp + 2);
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e6858bc..9ffe29f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -55,20 +55,17 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
// Forward declarations.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
-/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
-/// sets things up to match to an AVX VEXTRACTF128 instruction or a
-/// simple subregister reference. Idx is an index in the 128 bits we
-/// want. It need not be aligned to a 128-bit bounday. That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, DebugLoc dl) {
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
EVT VT = Vec.getValueType();
- assert(VT.is256BitVector() && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/128;
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
@@ -76,13 +73,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
if (Vec.getOpcode() == ISD::UNDEF)
return DAG.getUNDEF(ResultVT);
- // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
- // we can match to VEXTRACTF128.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
+ // This is the index of the first element of the vectorWidth-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
* ElemsPerChunk);
// If the input is a buildvector just emit a smaller one.
@@ -95,38 +91,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
VecIdx);
return Result;
+
+}
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}
-/// Generate a DAG to put 128-bits into a vector > 128 bits. This
-/// sets things up to match to an AVX VINSERTF128 instruction or a
-/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit bounday. That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- DebugLoc dl) {
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
// Inserting UNDEF is Result
if (Vec.getOpcode() == ISD::UNDEF)
return Result;
-
EVT VT = Vec.getValueType();
- assert(VT.is128BitVector() && "Unexpected vector size!");
-
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
- // Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
+ // This is the index of the first element of the vectorWidth-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
* ElemsPerChunk);
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
}
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
@@ -134,11 +163,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
@@ -163,10 +199,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
-
- RegInfo = TM.getRegisterInfo();
TD = getDataLayout();
+ resetOperationActions();
+}
+
+void X86TargetLowering::resetOperationActions() {
+ const TargetMachine &TM = getTargetMachine();
+ static bool FirstTimeThrough = true;
+
+ // If none of the target options have changed, then we don't need to reset the
+ // operation actions.
+ if (!FirstTimeThrough && TO == TM.Options) return;
+
+ if (!FirstTimeThrough) {
+ // Reinitialize the actions.
+ initActions();
+ FirstTimeThrough = false;
+ }
+
+ TO = TM.Options;
+
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -184,6 +237,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2
@@ -470,7 +525,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
- // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
// support continuation, user-level threading, and etc.. As a result, no
// other SjLj exception interfaces are implemented and please don't build
@@ -508,16 +563,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (Subtarget->hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
- setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
- // On X86 and X86-64, atomic operations are lowered to locked instructions.
- // Locked instructions, in turn, have implicit fence semantics (all memory
- // operations are flushed before issuing the locked instruction, and they
- // are not buffered), so we can fold away the common pattern of
- // fence-atomic-fence.
- setShouldFoldAtomicFences(true);
-
// Expand certain atomics
for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
@@ -552,10 +599,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
- setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
- setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
- setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
- setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
if (Subtarget->is64Bit()) {
setExceptionPointerRegister(X86::RAX);
setExceptionSelectorRegister(X86::RDX);
@@ -575,10 +618,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit()) {
+ if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
+ // TargetInfo::X86_64ABIBuiltinVaList
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
} else {
+ // TargetInfo::CharPtrBuiltinVaList
setOperationAction(ISD::VAARG , MVT::Other, Expand);
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
}
@@ -989,7 +1034,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
}
- if (Subtarget->hasSSE41()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -1053,23 +1098,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
setOperationAction(ISD::SRA, MVT::v16i8, Custom);
- if (Subtarget->hasInt256()) {
- setOperationAction(ISD::SRL, MVT::v2i64, Legal);
- setOperationAction(ISD::SRL, MVT::v4i32, Legal);
-
- setOperationAction(ISD::SHL, MVT::v2i64, Legal);
- setOperationAction(ISD::SHL, MVT::v4i32, Legal);
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Legal);
- } else {
- setOperationAction(ISD::SRL, MVT::v2i64, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
- setOperationAction(ISD::SHL, MVT::v2i64, Custom);
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Custom);
- }
setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
}
@@ -1118,6 +1156,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
@@ -1186,14 +1225,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
- setOperationAction(ISD::SRL, MVT::v4i64, Legal);
- setOperationAction(ISD::SRL, MVT::v8i32, Legal);
-
- setOperationAction(ISD::SHL, MVT::v4i64, Legal);
- setOperationAction(ISD::SHL, MVT::v8i32, Legal);
-
- setOperationAction(ISD::SRA, MVT::v8i32, Legal);
-
setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
@@ -1210,15 +1241,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::MUL, MVT::v8i32, Custom);
setOperationAction(ISD::MUL, MVT::v16i16, Custom);
// Don't lower v32i8 because there is no 128-bit byte mul
+ }
- setOperationAction(ISD::SRL, MVT::v4i64, Custom);
- setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ // In the customized shift lowering, the legal cases in AVX2 will be
+ // recognized.
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
- setOperationAction(ISD::SHL, MVT::v4i64, Custom);
- setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
- setOperationAction(ISD::SRA, MVT::v8i32, Custom);
- }
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
// Custom lower several nodes for 256-bit types.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1264,6 +1297,150 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
+ if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+ setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
+
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
+
+ setOperationAction(ISD::ADD, MVT::v8i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::AND, MVT::v8i64, Legal);
+ setOperationAction(ISD::OR, MVT::v8i64, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+
+ // Custom lower several nodes.
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ // Extract subvector is special because the value type
+ // (result) is 256/128-bit but the source is 512-bit wide.
+ if (VT.is128BitVector() || VT.is256BitVector())
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ if (VT.getVectorElementType() == MVT::i1)
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ // Do not attempt to custom lower other non-512-bit vectors
+ if (!VT.is512BitVector())
+ continue;
+
+ if (VT != MVT::v8i64) {
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+ }
+ if ( EltSize >= 32) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ }
+ }
+ for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to promote non-256-bit vectors
+ if (!VT.is512BitVector())
+ continue;
+
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+ }
+ }// has AVX-512
+
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1356,7 +1533,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- BenefitFromCodePlacementOpt = true;
// Predictable cmov don't hurt on atom because it's in-order.
PredictableSelectIsExpensive = !Subtarget->isAtom();
@@ -1364,7 +1540,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setPrefFunctionAlignment(4); // 2^4 bytes.
}
-EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector()) return MVT::i8;
return VT.changeVectorElementTypeToInteger();
}
@@ -1507,9 +1683,9 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget->is64Bit())
- // This doesn't have DebugLoc associated with it, but is not really the
+ // This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
- return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
return Table;
}
@@ -1596,7 +1772,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
@@ -1679,10 +1855,11 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// The x86-64 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument to %eax as well.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (Subtarget->is64Bit() &&
- DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
@@ -1690,12 +1867,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
"SRetReturnReg should have been set in LowerFormalArguments().");
SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
- unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX;
+ unsigned RetValReg
+ = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+ X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
- RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64));
+ RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
}
RetOps[0] = Chain; // Update chain.
@@ -1761,7 +1940,7 @@ SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
@@ -1795,7 +1974,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
SDValue Ops[] = { Chain, InFlag };
Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
- MVT::Other, MVT::Glue, Ops, 2), 1);
+ MVT::Other, MVT::Glue, Ops), 1);
Val = Chain.getValue(0);
// Round the f80 to the right size, which also moves it to the appropriate
@@ -1868,7 +2047,7 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -1883,13 +2062,19 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
CC == CallingConv::HiPE);
}
+/// \brief Return true if the calling convention is a C calling convention.
+static bool IsCCallConvention(CallingConv::ID CC) {
+ return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
+ CC == CallingConv::X86_64_SysV);
+}
+
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
return false;
CallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
- if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+ if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
return false;
return true;
@@ -1906,7 +2091,7 @@ SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
unsigned i) const {
@@ -1948,7 +2133,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -1964,7 +2149,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
bool IsWindows = Subtarget->isTargetWindows();
- bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
@@ -1975,9 +2160,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (IsWin64) {
+ if (IsWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
@@ -2003,12 +2187,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = &X86::FR64RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
else
llvm_unreachable("Unknown argument type!");
@@ -2049,9 +2239,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// The x86-64 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument to %eax as well.
// Save the argument into a virtual register so that we can access it
// from the return points.
- if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
+ if (MF.getFunction()->hasStructRetAttr() &&
+ (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
@@ -2223,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
@@ -2243,7 +2435,7 @@ SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
SDValue &OutRetAddr, SDValue Chain,
bool IsTailCall, bool Is64Bit,
- int FPDiff, DebugLoc dl) const {
+ int FPDiff, SDLoc dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy();
OutRetAddr = getReturnAddressFrameIndex(DAG);
@@ -2259,12 +2451,13 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, DebugLoc dl) {
+ unsigned SlotSize, int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
- MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+ false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(NewReturnAddrFI),
@@ -2276,10 +2469,10 @@ SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
- SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
- SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
- SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
@@ -2288,7 +2481,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
bool IsWindows = Subtarget->isTargetWindows();
StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
@@ -2321,9 +2514,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (IsWin64) {
+ if (IsWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
@@ -2352,7 +2544,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -2366,6 +2559,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
@@ -2441,7 +2636,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -2638,7 +2833,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2697,7 +2892,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPush,
true),
- InFlag);
+ InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2745,6 +2940,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const TargetMachine &TM = MF.getTarget();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
@@ -2829,13 +3026,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG &DAG) const {
- if (!IsTailCallConvention(CalleeCC) &&
- CalleeCC != CallingConv::C)
+ if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
const MachineFunction &MF = DAG.getMachineFunction();
- const Function *CallerF = DAG.getMachineFunction().getFunction();
+ const Function *CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
@@ -2845,6 +3041,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+ bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
@@ -2857,6 +3055,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -2876,7 +3076,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
- if (Subtarget->isTargetWin64())
+ if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2951,9 +3151,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (Subtarget->isTargetWin64()) {
+ if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
if (CCInfo.getNextStackOffset()) {
@@ -3060,7 +3259,7 @@ static bool isTargetShuffle(unsigned Opcode) {
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3071,7 +3270,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3085,7 +3284,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SDValue V2, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3098,7 +3297,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3117,13 +3316,16 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
- ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ -(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -3626,7 +3828,7 @@ static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
MVT VT = SVOp->getValueType(0).getSimpleVT();
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return SDValue();
@@ -3684,12 +3886,10 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
- i != (l+1)*NumLaneElts;
- i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (V2IsSplat) {
@@ -3723,11 +3923,10 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
- i != (l+1)*NumLaneElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (V2IsSplat) {
@@ -3768,12 +3967,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
- i != (l+1)*NumLaneElts;
- i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
@@ -3803,11 +4000,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
- i != (l+1)*NumLaneElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (!isUndefOrEqual(BitI1, j))
@@ -4039,42 +4235,59 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
return true;
}
-/// isVEXTRACTF128Index - Return true if the specified
+/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
-/// suitable for input to VEXTRACTF128.
-bool X86::isVEXTRACTF128Index(SDNode *N) {
+/// suitable for instruction that extract 128 or 256 bit vectors
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
return false;
- // The index should be aligned on a 128-bit boundary.
+ // The index should be aligned on a vecWidth-bit boundary.
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
MVT VT = N->getValueType(0).getSimpleVT();
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
- bool Result = (Index * ElSize) % 128 == 0;
+ bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
}
-/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
+/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
-/// VINSERTF128.
-bool X86::isVINSERTF128Index(SDNode *N) {
+/// insertion of 128 or 256-bit subvectors
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
return false;
-
- // The index should be aligned on a 128-bit boundary.
+ // The index should be aligned on a vecWidth-bit boundary.
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
MVT VT = N->getValueType(0).getSimpleVT();
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
- bool Result = (Index * ElSize) % 128 == 0;
+ bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
}
+bool X86::isVINSERT128Index(SDNode *N) {
+ return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+ return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 256);
+}
+
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
@@ -4178,12 +4391,10 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
return (Val - i) * EltSize;
}
-/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
-/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
-/// instructions.
-unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
- llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+ llvm_unreachable("Illegal extract subvector for VEXTRACT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
@@ -4191,16 +4402,14 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
MVT ElVT = VecVT.getVectorElementType();
- unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
}
-/// getInsertVINSERTF128Immediate - Return the appropriate immediate
-/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
-/// instructions.
-unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
- llvm_unreachable("Illegal insert subvector for VINSERTF128");
+ llvm_unreachable("Illegal insert subvector for VINSERT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
@@ -4208,10 +4417,38 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
MVT VecVT = N->getValueType(0).getSimpleVT();
MVT ElVT = VecVT.getVectorElementType();
- unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
}
+/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 128);
+}
+
+/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 256);
+}
+
+/// getInsertVINSERT128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 128);
+}
+
+/// getInsertVINSERT256Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 256);
+}
+
/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
/// Handles 256-bit.
@@ -4261,7 +4498,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
}
MaskVec.push_back(Idx);
}
- return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
SVOp->getOperand(0), &MaskVec[0]);
}
@@ -4394,7 +4631,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, DebugLoc dl) {
+ SelectionDAG &DAG, SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Always build SSE zero vectors as <4 x i32> bitcasted
@@ -4412,13 +4649,15 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
if (Subtarget->hasInt256()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+ array_lengthof(Ops));
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
+ array_lengthof(Ops));
}
} else
llvm_unreachable("Unexpected vector type");
@@ -4431,7 +4670,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
@@ -4439,7 +4678,8 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
+ array_lengthof(Ops));
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -4464,7 +4704,7 @@ static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4475,7 +4715,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
}
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4487,7 +4727,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
}
/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4505,7 +4745,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
- DebugLoc dl = V.getDebugLoc();
+ SDLoc dl(V);
while (NumElems > 4) {
if (EltNo < NumElems/2) {
@@ -4522,7 +4762,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
- DebugLoc dl = V.getDebugLoc();
+ SDLoc dl(V);
if (VT.is128BitVector()) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
@@ -4549,7 +4789,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
EVT SrcVT = SV->getValueType(0);
SDValue V1 = SV->getOperand(0);
- DebugLoc dl = SV->getDebugLoc();
+ SDLoc dl(SV);
int EltNo = SV->getSplatIndex();
int NumElems = SrcVT.getVectorNumElements();
@@ -4594,13 +4834,13 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
SelectionDAG &DAG) {
EVT VT = V2.getValueType();
SDValue V1 = IsZero
- ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec.push_back(i == Idx ? NumElems : i);
- return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
}
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
@@ -4751,19 +4991,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come from a consecutively from a zero. The
/// search can start in two different directions, from left or right.
-static
-unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
- bool ZerosFromLeft, SelectionDAG &DAG) {
- unsigned i;
- for (i = 0; i != NumElems; ++i) {
- unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+/// We count undefs as zeros until PreferredNum is reached.
+static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
+ unsigned NumElems, bool ZerosFromLeft,
+ SelectionDAG &DAG,
+ unsigned PreferredNum = -1U) {
+ unsigned NumZeros = 0;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
- if (!(Elt.getNode() &&
- (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+ if (!Elt.getNode())
+ break;
+
+ if (X86::isZeroNode(Elt))
+ ++NumZeros;
+ else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
+ NumZeros = std::min(NumZeros + 1, PreferredNum);
+ else
break;
}
- return i;
+ return NumZeros;
}
/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
@@ -4801,8 +5049,9 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
- false /* check zeros from right */, DAG);
+ unsigned NumZeros = getNumOfConsecutiveZeros(
+ SVOp, NumElems, false /* check zeros from right */, DAG,
+ SVOp->getMaskElt(0));
unsigned OpSrc;
if (!NumZeros)
@@ -4834,8 +5083,9 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
- true /* check zeros from left */, DAG);
+ unsigned NumZeros = getNumOfConsecutiveZeros(
+ SVOp, NumElems, true /* check zeros from left */, DAG,
+ NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
unsigned OpSrc;
if (!NumZeros)
@@ -4888,7 +5138,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
if (NumNonZero > 8)
return SDValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V(0, 0);
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
@@ -4936,7 +5186,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
if (NumNonZero > 4)
return SDValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V(0, 0);
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
@@ -4962,7 +5212,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
- const TargetLowering &TLI, DebugLoc dl) {
+ const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
@@ -4974,7 +5224,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
}
SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
SelectionDAG &DAG) const {
// Check if the scalar load can be widened into a vector load. And if
@@ -5027,7 +5277,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
return SDValue();
int64_t StartOffset = Offset & ~(RequiredAlign-1);
if (StartOffset)
- Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+ Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
@@ -5038,10 +5288,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- Mask.push_back(EltNo);
-
+ SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -5058,7 +5305,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
- DebugLoc &DL, SelectionDAG &DAG) {
+ SDLoc &DL, SelectionDAG &DAG) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
@@ -5094,22 +5341,35 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ SDValue NewLd = SDValue();
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
- return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), 0);
- return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), LDBase->getAlignment());
+ NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
+ NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), LDBase->getAlignment());
+
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
}
if (NumElems == 4 && LastLoadedElt == 1 &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ array_lengthof(Ops), MVT::i64,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
@@ -5144,9 +5404,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
SDValue Ld;
@@ -5202,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
// The scalar_to_vector node and the suspected
// load node must have exactly one user.
// Constants may have multiple users.
- if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+
+ // AVX-512 has register version of the broadcast
+ bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+ Ld.getValueType().getSizeInBits() >= 32;
+ if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+ !hasRegVer))
return SDValue();
break;
}
}
- bool Is256 = VT.is256BitVector();
+ bool IsGE256 = (VT.getSizeInBits() >= 256);
// Handle the broadcasting a single constant scalar from the constant pool
// into a vector. On Sandybridge it is still better to load a constant vector
@@ -5218,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
assert(!CVT.isVector() && "Must not broadcast a vector type");
unsigned ScalarSize = CVT.getSizeInBits();
- if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
const Constant *C = 0;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -5242,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget->hasInt256() &&
- (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+ (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
- if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -5271,7 +5536,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
@@ -5337,14 +5602,120 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
return NV;
}
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+ "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+ SDLoc dl(Op);
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ Ops, VT.getVectorNumElements());
+ }
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ Ops, VT.getVectorNumElements());
+ }
+
+ bool AllContants = true;
+ uint64_t Immediate = 0;
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.getOpcode() == ISD::UNDEF)
+ continue;
+ if (!isa<ConstantSDNode>(In)) {
+ AllContants = false;
+ break;
+ }
+ if (cast<ConstantSDNode>(In)->getZExtValue())
+ Immediate |= (1ULL << idx);
+ }
+
+ if (AllContants) {
+ SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
+ DAG.getConstant(Immediate, MVT::i16));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
+ DAG.getIntPtrConstant(0));
+ }
+
+ if (!isSplatVector(Op.getNode()))
+ llvm_unreachable("Unsupported predicate operation");
+
+ SDValue In = Op.getOperand(0);
+ SDValue EFLAGS, X86CC;
+ if (In.getOpcode() == ISD::SETCC) {
+ SDValue Op0 = In.getOperand(0);
+ SDValue Op1 = In.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
+ bool isFP = Op1.getValueType().isFloatingPoint();
+ unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+
+ assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
+
+ X86CC = DAG.getConstant(X86CCVal, MVT::i8);
+ EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ } else if (In.getOpcode() == X86ISD::SETCC) {
+ X86CC = In.getOperand(0);
+ EFLAGS = In.getOperand(1);
+ } else {
+ // The algorithm:
+ // Bit1 = In & 0x1
+ // if (Bit1 != 0)
+ // ZF = 0
+ // else
+ // ZF = 1
+ // if (ZF == 0)
+ // res = allOnes ### CMOVNE -1, %res
+ // else
+ // res = allZero
+ MVT InVT = In.getValueType().getSimpleVT();
+ SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
+ EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
+ X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ }
+
+ if (VT == MVT::v16i1) {
+ SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
+ SDValue Cst0 = DAG.getConstant(0, MVT::i16);
+ SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
+ Cst0, Cst1, X86CC, EFLAGS);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+ }
+
+ if (VT == MVT::v8i1) {
+ SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
+ SDValue Cst0 = DAG.getConstant(0, MVT::i32);
+ SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
+ Cst0, Cst1, X86CC, EFLAGS);
+ CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+ }
+ llvm_unreachable("Unsupported predicate operation");
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
+ // Generate vectors for predicate vectors.
+ if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
// Vectors containing all zeros can be matched by pxor and xorps later
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
@@ -5398,7 +5769,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If this is an insertion of an i64 value on x86-32, and if the top bits of
@@ -5507,7 +5878,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
@@ -5542,7 +5913,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
@@ -5672,22 +6043,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT ResVT = Op.getValueType().getSimpleVT();
- assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+ assert((ResVT.is256BitVector() ||
+ ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
+ if(ResVT.is256BitVector())
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
- return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 2);
- // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
@@ -5698,7 +6072,7 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT VT = SVOp->getValueType(0).getSimpleVT();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
@@ -5759,7 +6133,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
@@ -6014,7 +6388,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
const X86TargetLowering &TLI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
ArrayRef<int> MaskVals = SVOp->getMask();
// Promote splats to a larger type which usually leads to more efficient code.
@@ -6143,7 +6517,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
MVT VT = SVOp->getValueType(0).getSimpleVT();
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6189,7 +6563,7 @@ static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
MVT VT = SVOp->getValueType(0).getSimpleVT();
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
unsigned NumElems = VT.getVectorNumElements();
MVT NewVT;
unsigned Scale;
@@ -6227,7 +6601,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
///
static SDValue getVZextMovL(MVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, DebugLoc dl) {
+ const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
LoadSDNode *LD = NULL;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
@@ -6272,7 +6646,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLaneElems = NumElems / 2;
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT EltVT = VT.getVectorElementType();
MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
SDValue Output[2];
@@ -6378,7 +6752,7 @@ static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT VT = SVOp->getValueType(0).getSimpleVT();
assert(VT.is128BitVector() && "Unsupported vector size");
@@ -6529,7 +6903,7 @@ static bool MayFoldVectorLoad(SDValue V) {
}
static
-SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
+SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Canonizalize to v2f64.
@@ -6540,7 +6914,7 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
}
static
-SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -6559,7 +6933,7 @@ SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
}
static
-SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -6575,7 +6949,7 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
}
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -6645,7 +7019,7 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = VT.getVectorNumElements();
@@ -6706,10 +7080,10 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
EVT FullVT = V.getValueType();
- EVT SubVecVT = EVT::getVectorVT(*Context,
+ EVT SubVecVT = EVT::getVectorVT(*Context,
FullVT.getVectorElementType(),
FullVT.getVectorNumElements()/Ratio);
- V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
DAG.getIntPtrConstant(0));
}
V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
@@ -6724,7 +7098,7 @@ SDValue
X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -6783,7 +7157,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned NumElems = VT.getVectorNumElements();
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6865,6 +7239,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
TargetMask, DAG);
}
+ if (isPALIGNRMask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+ getShufflePALIGNRImmediate(SVOp),
+ DAG);
+
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
@@ -6982,11 +7361,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// inlined here right now to enable us to directly emit target specific
// nodes, and remove one by one until they don't return Op anymore.
- if (isPALIGNRMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
- getShufflePALIGNRImmediate(SVOp),
- DAG);
-
if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
SVOp->getSplatIndex() == 0 && V2IsUndef) {
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -7094,7 +7468,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
return SDValue();
@@ -7157,6 +7531,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc dl(Op);
if (!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
@@ -7165,17 +7540,19 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
- if (VecVT.is256BitVector()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
- unsigned NumElems = VecVT.getVectorNumElements();
+ if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ EVT EltVT = VecVT.getVectorElementType();
+
+ unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
- if (IdxVal >= NumElems/2)
- IdxVal -= NumElems/2;
+ //if (IdxVal >= NumElems/2)
+ // IdxVal -= NumElems/2;
+ IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getConstant(IdxVal, MVT::i32));
}
@@ -7189,7 +7566,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
@@ -7248,7 +7624,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT.getVectorElementType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
@@ -7303,26 +7679,27 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT.getVectorElementType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
// If this is a 256-bit vector result, first extract the 128-bit vector,
// insert the element into the extracted half and then place it back.
- if (VT.is256BitVector()) {
+ if (VT.is256BitVector() || VT.is512BitVector()) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
// Get the desired 128-bit vector half.
- unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired half.
- bool Upper = IdxVal >= NumElems/2;
+ unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
+ unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
+ DAG.getConstant(IdxIn128, MVT::i32));
// Insert the changed part back to the 256-bit vector
return Insert128BitVector(N0, V, IdxVal, DAG, dl);
@@ -7348,16 +7725,17 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT OpVT = Op.getValueType().getSimpleVT();
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
+ unsigned SizeFactor = OpVT.getSizeInBits()/128;
EVT VT128 = EVT::getVectorVT(*Context,
OpVT.getVectorElementType(),
- OpVT.getVectorNumElements() / 2);
+ OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
@@ -7380,16 +7758,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue Idx = Op.getNode()->getOperand(1);
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ EVT ResVT = Op.getValueType();
+ EVT InVT = In.getValueType();
- if (Op.getNode()->getValueType(0).is128BitVector() &&
- Vec.getNode()->getValueType(0).is256BitVector() &&
+ if (Subtarget->hasFp256()) {
+ if (ResVT.is128BitVector() &&
+ (InVT.is256BitVector() || InVT.is512BitVector()) &&
isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Extract128BitVector(Vec, IdxVal, DAG, dl);
+ return Extract128BitVector(In, IdxVal, DAG, dl);
+ }
+ if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ return Extract256BitVector(In, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7401,17 +7785,25 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
if (Subtarget->hasFp256()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDLoc dl(Op.getNode());
SDValue Vec = Op.getNode()->getOperand(0);
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if (Op.getNode()->getValueType(0).is256BitVector() &&
+ if ((Op.getNode()->getValueType(0).is256BitVector() ||
+ Op.getNode()->getValueType(0).is512BitVector()) &&
SubVec.getNode()->getValueType(0).is128BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
+
+ if (Op.getNode()->getValueType(0).is512BitVector() &&
+ SubVec.getNode()->getValueType(0).is256BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ }
}
return SDValue();
}
@@ -7443,13 +7835,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
CP->getAlignment(),
CP->getOffset(), OpFlag);
- DebugLoc DL = CP->getDebugLoc();
+ SDLoc DL(CP);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
}
@@ -7475,14 +7867,14 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
OpFlag);
- DebugLoc DL = JT->getDebugLoc();
+ SDLoc DL(JT);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
return Result;
@@ -7513,7 +7905,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
@@ -7521,7 +7913,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
!Subtarget->is64Bit()) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
}
@@ -7542,7 +7934,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
CodeModel::Model M = getTargetMachine().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
OpFlags);
@@ -7563,7 +7955,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
int64_t Offset, SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
@@ -7612,7 +8004,7 @@ SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+ return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}
static SDValue
@@ -7621,7 +8013,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
@@ -7632,10 +8024,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -7650,10 +8042,10 @@ static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
- DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
+ SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), PtrVT), InFlag);
+ SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7671,7 +8063,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
@@ -7685,7 +8077,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
@@ -7710,7 +8102,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
@@ -7749,7 +8141,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
@@ -7803,7 +8195,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
@@ -7813,7 +8205,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Offset);
// Lowering the machine isd will make sure everything is in the right
@@ -7850,7 +8242,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
GV = GA->resolveAliasedGlobal(false);
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
@@ -7908,7 +8300,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
@@ -7945,7 +8337,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
}
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -7967,7 +8359,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return Op;
}
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
@@ -7983,7 +8375,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
if (useSSE)
@@ -8058,11 +8450,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
#endif
*/
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
- const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
@@ -8112,7 +8504,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
MVT::f64);
@@ -8160,7 +8552,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
EVT SVT = N0.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
@@ -8175,7 +8567,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -8228,13 +8620,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
- SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
- MVT::i64, MMO);
+ SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+ array_lengthof(Ops), MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
- SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+ SDValue SignSet = DAG.getSetCC(dl,
+ getSetCCResultType(*DAG.getContext(), MVT::i64),
Op.getOperand(0), DAG.getConstant(0, MVT::i64),
ISD::SETLT);
@@ -8263,7 +8656,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
std::pair<SDValue,SDValue>
X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, bool IsReplace) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT DstTy = Op.getValueType();
@@ -8321,8 +8714,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
- DstTy, MMO);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
+ array_lengthof(Ops), DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -8336,7 +8729,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, 3, DstTy, MMO);
+ Ops, array_lengthof(Ops), DstTy,
+ MMO);
return std::make_pair(FIST, StackSlot);
} else {
SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
@@ -8348,8 +8742,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MVT::i32, eax.getValue(2));
SDValue Ops[] = { eax, edx };
SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
- : DAG.getMergeValues(Ops, 2, DL);
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
+ : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
return std::make_pair(pair, SDValue());
}
}
@@ -8359,7 +8753,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT VT = Op->getValueType(0).getSimpleVT();
SDValue In = Op->getOperand(0);
MVT InVT = In.getValueType().getSimpleVT();
- DebugLoc dl = Op->getDebugLoc();
+ SDLoc dl(Op);
// Optimize vectors in AVX mode:
//
@@ -8408,7 +8802,7 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
}
SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8440,7 +8834,7 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8561,8 +8955,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
if (VT.isVector()) {
if (VT == MVT::v8i16)
- return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
- DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
+ DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
MVT::v8i32, Op.getOperand(0)));
return SDValue();
}
@@ -8575,7 +8969,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
@@ -8592,7 +8986,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
@@ -8601,7 +8995,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8615,7 +9009,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
@@ -8649,7 +9043,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
@@ -8686,7 +9080,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT SrcVT = Op1.getValueType().getSimpleVT();
@@ -8763,7 +9157,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
@@ -8785,7 +9179,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
return SDValue();
SDNode *N = Op.getNode();
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, unsigned> VecInMap;
@@ -8797,7 +9191,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
Opnds.push_back(N->getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
- SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all OR'd operands.
if (I->getOpcode() == ISD::OR) {
Opnds.push_back(I->getOperand(0));
@@ -8869,7 +9263,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// CF and OF aren't always set the way we want. Determine which
// of these we need.
@@ -9084,7 +9478,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (C->getAPIntValue() == 0)
return EmitTest(Op0, X86CC, DAG);
- DebugLoc dl = Op0.getDebugLoc();
+ SDLoc dl(Op0);
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
// Use SUB instead of CMP to enable CSE between SUB and CMP.
@@ -9111,7 +9505,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
- DebugLoc dl = Cmp.getDebugLoc();
+ SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
@@ -9128,7 +9522,7 @@ static bool isAllOnes(SDValue V) {
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -9173,14 +9567,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
}
if (LHS.getNode()) {
- // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
- // the condition code later.
- bool Invert = false;
- if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
- Invert = true;
- LHS = LHS.getOperand(0);
- }
-
// If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
@@ -9197,9 +9583,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
- // Flip the condition if the LHS was a not instruction
- if (Invert)
- Cond = X86::GetOppositeBranchCondition(Cond);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(Cond, MVT::i8), BT);
}
@@ -9207,6 +9590,51 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
+/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
+/// mask CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+ SDValue &Op1) {
+ unsigned SSECC;
+ bool Swap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true; // Fallthrough
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true; // Fallthrough
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ return SSECC;
+}
+
// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
@@ -9216,7 +9644,7 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS vectors
@@ -9246,7 +9674,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
MVT VT = Op.getValueType().getSimpleVT();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
@@ -9254,43 +9682,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
- unsigned SSECC;
- bool Swap = false;
-
- // SSE Condition code mapping:
- // 0 - EQ
- // 1 - LT
- // 2 - LE
- // 3 - UNORD
- // 4 - NEQ
- // 5 - NLT
- // 6 - NLE
- // 7 - ORD
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETOEQ:
- case ISD::SETEQ: SSECC = 0; break;
- case ISD::SETOGT:
- case ISD::SETGT: Swap = true; // Fallthrough
- case ISD::SETLT:
- case ISD::SETOLT: SSECC = 1; break;
- case ISD::SETOGE:
- case ISD::SETGE: Swap = true; // Fallthrough
- case ISD::SETLE:
- case ISD::SETOLE: SSECC = 2; break;
- case ISD::SETUO: SSECC = 3; break;
- case ISD::SETUNE:
- case ISD::SETNE: SSECC = 4; break;
- case ISD::SETULE: Swap = true; // Fallthrough
- case ISD::SETUGE: SSECC = 5; break;
- case ISD::SETULT: Swap = true; // Fallthrough
- case ISD::SETUGT: SSECC = 6; break;
- case ISD::SETO: SSECC = 7; break;
- case ISD::SETUEQ:
- case ISD::SETONE: SSECC = 8; break;
- }
- if (Swap)
- std::swap(Op0, Op1);
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
// In the two special cases we can't handle, emit two comparisons.
if (SSECC == 8) {
@@ -9322,8 +9714,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc;
- bool Swap = false, Invert = false, FlipSigns = false;
-
+ bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
@@ -9337,20 +9729,77 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
case ISD::SETUGE: Swap = true;
case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
}
+
+ // Special case: Use min/max operations for SETULE/SETUGE
+ MVT VET = VT.getVectorElementType();
+ bool hasMinMax =
+ (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget->hasSSE2() && (VET == MVT::i8));
+
+ if (hasMinMax) {
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+ }
+
+ if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ }
+
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
- if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
- return SDValue();
+ if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
+ assert(Subtarget->hasSSE2() && "Don't know how to lower!");
+
+ // First cast everything to the right type.
+ Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+ SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Sign, Zero, Sign, Zero);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+ // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+ // Create masks for only the low parts/high parts of the 64 bit integers.
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskLo[] = { 0, 0, 2, 2 };
+ SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+ SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+ SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+ Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+
if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
- // First cast everything to the right type,
+ // First cast everything to the right type.
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
@@ -9358,7 +9807,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
- const int Mask[] = { 1, 0, 3, 2 };
+ static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
@@ -9369,17 +9818,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
}
}
- // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
EVT EltVT = VT.getVectorElementType();
- SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
- EltVT);
- std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
- SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
- SignBits.size());
- Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
- Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
@@ -9387,6 +9832,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
+
+ if (MinMax)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
return Result;
}
@@ -9400,7 +9848,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
// Optimize to BT if possible.
@@ -9494,9 +9942,31 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
+ EVT VT = Op1.getValueType();
SDValue CC;
+ // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available. Otherwise fp cmovs get lowered into a less efficient branch
+ // sequence later on.
+ if (Cond.getOpcode() == ISD::SETCC &&
+ ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+ VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
+ SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+ int SSECC = translateX86FSETCC(
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+ if (SSECC != 8) {
+ unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
+ SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, MVT::i8));
+ SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+ SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+ return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+ }
+ }
+
if (Cond.getOpcode() == ISD::SETCC) {
SDValue NewCond = LowerSETCC(Cond, DAG);
if (NewCond.getNode())
@@ -9684,7 +10154,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
MVT VT = Op->getValueType(0).getSimpleVT();
SDValue In = Op->getOperand(0);
MVT InVT = In.getValueType().getSimpleVT();
- DebugLoc dl = Op->getDebugLoc();
+ SDLoc dl(Op);
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
(VT != MVT::v8i32 || InVT != MVT::v8i16))
@@ -9757,7 +10227,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
@@ -10027,7 +10497,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
"This should be used only on Windows targets or when segmented stacks "
"are being used");
assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Get the inputs.
SDValue Chain = Op.getOperand(0);
@@ -10072,6 +10542,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
Flag = Chain.getValue(1);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
SPTy).getValue(1);
@@ -10085,7 +10557,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -10152,7 +10624,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -10218,7 +10690,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
@@ -10228,7 +10700,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
@@ -10272,7 +10744,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
@@ -10822,8 +11294,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
X86CC = X86::COND_E;
break;
}
- SmallVector<SDValue, 5> NewOps;
- NewOps.append(Op->op_begin()+1, Op->op_end());
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -10840,8 +11311,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
else
Opcode = X86ISD::PCMPESTRI;
- SmallVector<SDValue, 5> NewOps;
- NewOps.append(Op->op_begin()+1, Op->op_end());
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
}
@@ -10917,33 +11387,52 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- // RDRAND intrinsics.
+ // RDRAND/RDSEED intrinsics.
case Intrinsic::x86_rdrand_16:
case Intrinsic::x86_rdrand_32:
- case Intrinsic::x86_rdrand_64: {
+ case Intrinsic::x86_rdrand_64:
+ case Intrinsic::x86_rdseed_16:
+ case Intrinsic::x86_rdseed_32:
+ case Intrinsic::x86_rdseed_64: {
+ unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
+ IntNo == Intrinsic::x86_rdseed_32 ||
+ IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
+ X86ISD::RDRAND;
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
- SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+ SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
- // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
- // return the value from Rand, which is always 0, casted to i32.
+ // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+ // Otherwise return the value from Rand, which is always 0, casted to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, Op->getValueType(1)),
DAG.getConstant(X86::COND_B, MVT::i32),
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
- Ops, 4);
+ Ops, array_lengthof(Ops));
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+
+ // XTEST intrinsics.
+ case Intrinsic::x86_xtest: {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_NE, MVT::i8),
+ InTrans);
+ SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+ Ret, SDValue(InTrans.getNode(), 1));
+ }
}
}
@@ -10953,13 +11442,14 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
MFI->setReturnAddressIsTaken(true);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = getPointerTy();
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- SDValue Offset =
- DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
FrameAddr, Offset),
@@ -10977,9 +11467,14 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
@@ -10990,6 +11485,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
@@ -10997,28 +11494,32 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
-
- SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
- Subtarget->is64Bit() ? X86::RBP : X86::EBP,
- getPointerTy());
- unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
+ SDLoc dl (Op);
- SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
- DAG.getIntPtrConstant(RegInfo->getSlotSize()));
- StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
+ EVT PtrVT = getPointerTy();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+ (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+ "Invalid Frame Register!");
+ SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+ unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
+ SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+ DAG.getIntPtrConstant(RegInfo->getSlotSize()));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
- return DAG.getNode(X86ISD::EH_RETURN, dl,
- MVT::Other,
- Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
+ return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+ DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
@@ -11026,7 +11527,7 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
@@ -11041,7 +11542,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
@@ -11211,7 +11712,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
@@ -11224,7 +11725,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
- Ops, 2, MVT::i16, MMO);
+ Ops, array_lengthof(Ops), MVT::i16,
+ MMO);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
@@ -11257,7 +11759,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -11291,7 +11793,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -11315,7 +11817,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
// Issue a bsf (scan bits forward) which also sets EFLAGS.
@@ -11341,7 +11843,7 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
@@ -11377,7 +11879,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -11393,7 +11895,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
- const int UnpackMask[] = { 1, -1, 3, -1 };
+ static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
@@ -11407,7 +11909,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
- const int ShufMask[] = { 0, 4, 2, 6 };
+ static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
@@ -11453,7 +11955,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
EVT EltTy = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Lower sdiv X, pow2-const.
BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
@@ -11461,9 +11963,11 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
APInt SplatValue, SplatUndef;
- unsigned MinSplatBits;
+ unsigned SplatBitSize;
bool HasAnyUndefs;
- if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+ if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs) ||
+ EltTy.getSizeInBits() < SplatBitSize)
return SDValue();
if ((SplatValue != 0) &&
@@ -11491,16 +11995,13 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
-
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- if (!Subtarget->hasSSE2())
- return SDValue();
-
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
SDValue SclrAmt = Amt->getOperand(0);
@@ -11611,6 +12112,224 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+ uint64_t ShiftAmt = 0;
+ for (unsigned i = 0; i != Ratio; ++i) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
+ if (C == 0)
+ return SDValue();
+ // 6 == Log2(64)
+ ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+ }
+ // Check remaining shift amounts.
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ uint64_t ShAmt = 0;
+ for (unsigned j = 0; j != Ratio; ++j) {
+ ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+ if (C == 0)
+ return SDValue();
+ // 6 == Log2(64)
+ ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+ }
+ if (ShAmt != ShiftAmt)
+ return SDValue();
+ }
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ case ISD::SRL:
+ return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ case ISD::SRA:
+ return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
+ VT == MVT::v4i32 || VT == MVT::v8i16 ||
+ (Subtarget->hasInt256() &&
+ ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
+ VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ SDValue BaseShAmt;
+ EVT EltVT = VT.getVectorElementType();
+
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i, j;
+ for (i = 0; i != NumElts; ++i) {
+ if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
+ continue;
+ break;
+ }
+ for (j = i; j != NumElts; ++j) {
+ SDValue Arg = Amt.getOperand(j);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ if (Arg != Amt.getOperand(i))
+ break;
+ }
+ if (i != NumElts && j == NumElts)
+ BaseShAmt = Amt.getOperand(i);
+ } else {
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ Amt = Amt.getOperand(0);
+ if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+ SDValue InVec = Amt.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = InVec.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned SplatIdx =
+ cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+ if (BaseShAmt.getNode() == 0)
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
+ DAG.getIntPtrConstant(0));
+ }
+ }
+
+ if (BaseShAmt.getNode()) {
+ if (EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
+ }
+ case ISD::SRA:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
+ }
+ case ISD::SRL:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
+ }
+ }
+ }
+ }
+
+ // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ if (!Subtarget->is64Bit() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Amt.getOpcode() == ISD::BITCAST &&
+ Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ Amt = Amt.getOperand(0);
+ unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ VT.getVectorNumElements();
+ std::vector<SDValue> Vals(Ratio);
+ for (unsigned i = 0; i != Ratio; ++i)
+ Vals[i] = Amt.getOperand(i);
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ for (unsigned j = 0; j != Ratio; ++j)
+ if (Vals[j] != Amt.getOperand(i + j))
+ return SDValue();
+ }
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ case ISD::SHL:
+ return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
+ case ISD::SRL:
+ return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
+ case ISD::SRA:
+ return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+ SDValue V;
+
+ if (!Subtarget->hasSSE2())
+ return SDValue();
+
+ V = LowerScalarImmediateShift(Op, DAG, Subtarget);
+ if (V.getNode())
+ return V;
+
+ V = LowerScalarVariableShift(Op, DAG, Subtarget);
+ if (V.getNode())
+ return V;
+
+ // AVX2 has VPSLLV/VPSRAV/VPSRLV.
+ if (Subtarget->hasInt256()) {
+ if (Op.getOpcode() == ISD::SRL &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v4i64 || VT == MVT::v8i32))
+ return Op;
+ if (Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v4i64 || VT == MVT::v8i32))
+ return Op;
+ if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
+ return Op;
+ }
+
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
@@ -11717,7 +12436,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue RHS = N->getOperand(1);
unsigned BaseOp = 0;
unsigned Cond = 0;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
@@ -11784,7 +12503,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
EVT VT = Op.getValueType();
@@ -11827,62 +12546,31 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
// fall through
case MVT::v4i32:
case MVT::v8i16: {
- SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
- Op.getOperand(0), ShAmt, DAG);
+ // (sext (vzext x)) -> (vsext x)
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op00 = Op0.getOperand(0);
+ SDValue Tmp1;
+ // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
+ Tmp1 = LowerVectorIntExtend(Op00, DAG);
+ if (Tmp1.getNode()) {
+ SDValue Tmp1Op0 = Tmp1.getOperand(0);
+ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+ "This optimization is invalid without a VZEXT.");
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ }
+
+ // If the above didn't work, then just use Shift-Left + Shift-Right.
+ Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
}
}
}
-static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
-
- // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
- // There isn't any reason to disable it if the target processor supports it.
- if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
- SDValue Chain = Op.getOperand(0);
- SDValue Zero = DAG.getConstant(0, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i32), // Segment.
- Zero,
- Chain
- };
- SDNode *Res =
- DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
- array_lengthof(Ops));
- return SDValue(Res, 0);
- }
-
- unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
- if (!isDev)
- return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
-
- unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
-
- // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
- if (!Op1 && !Op2 && !Op3 && Op4)
- return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
-
- // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
- if (Op1 && !Op2 && !Op3 && !Op4)
- return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
-
- // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
- // (MFENCE)>;
- return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
-}
-
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
@@ -11908,9 +12596,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
Zero,
Chain
};
- SDNode *Res =
- DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
- array_lengthof(Ops));
+ SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
@@ -11921,7 +12607,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT T = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.getSimpleVT().SimpleTy) {
@@ -11944,7 +12630,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
- Ops, 5, T, MMO);
+ Ops, array_lengthof(Ops), T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
return cpOut;
@@ -11955,7 +12641,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
assert(Subtarget->is64Bit() && "Result not type legalized?");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
@@ -11966,7 +12652,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
rdx.getValue(1)
};
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
}
SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
@@ -11991,7 +12677,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT T = Node->getValueType(0);
SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
DAG.getConstant(0, T), Node->getOperand(2));
@@ -12007,7 +12693,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
// Convert seq_cst store -> xchg
@@ -12050,9 +12736,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
}
if (!ExtraOp)
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1));
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1), Op.getOperand(2));
}
@@ -12060,8 +12746,9 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
- // which returns the values in two XMM registers.
- DebugLoc dl = Op.getDebugLoc();
+ // which returns the values as { float, float } (in XMM0) or
+ // { double, double } (which is returned in XMM0, XMM1).
+ SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -12075,14 +12762,16 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
Entry.isZExt = false;
Args.push_back(Entry);
+ bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
- const char *LibcallName = (ArgVT == MVT::f64)
- ? "__sincos_stret" : "__sincosf_stret";
+ const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
- StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ Type *RetTy = isF64
+ ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+ : (Type*)VectorType::get(ArgTy, 4);
TargetLowering::
CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
false, false, false, false, 0,
@@ -12090,7 +12779,18 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
/*doesNotRet=*/false, /*isReturnValueUsed*/true,
Callee, Args, DAG, dl);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
- return CallResult.first;
+
+ if (isF64)
+ // Returned in xmm0 and xmm1.
+ return CallResult.first;
+
+ // Returned in bits 0:31 and 32:64 xmm0.
+ SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(0));
+ SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+ CallResult.first, DAG.getIntPtrConstant(1));
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// LowerOperation - Provide custom lowering hooks for some operations.
@@ -12099,7 +12799,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
- case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
@@ -12182,7 +12881,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
static void ReplaceATOMIC_LOAD(SDNode *Node,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
// Convert wide load -> cmpxchg8b/cmpxchg16b
@@ -12203,7 +12902,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
static void
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
assert (Node->getValueType(0) == MVT::i64 &&
"Only know how to expand i64 atomics");
@@ -12216,7 +12915,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SDValue Ops[] = { Chain, In1, In2L, In2H };
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
cast<MemSDNode>(Node)->getMemOperand());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
@@ -12228,7 +12927,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (N->getOpcode()) {
default:
@@ -12296,7 +12995,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
eax.getValue(2));
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { eax, edx };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
+ array_lengthof(Ops)));
Results.push_back(edx.getValue(1));
return;
}
@@ -12335,7 +13035,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
X86ISD::LCMPXCHG8_DAG;
SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
- Ops, 3, T, MMO);
+ Ops, array_lengthof(Ops), T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -12411,6 +13111,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::SHRD: return "X86ISD::SHRD";
case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FSRL: return "X86ISD::FSRL";
@@ -12534,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
@@ -12547,6 +13249,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
@@ -12555,6 +13258,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::XTEST: return "X86ISD::XTEST";
}
}
@@ -12620,6 +13324,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return NumBits1 > NumBits2;
}
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+
+ if (!isTypeLegal(EVT::getEVT(Ty1)))
+ return false;
+
+ assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+ // Assuming the caller doesn't have a zeroext or signext return parameter,
+ // truncation all the way down to i1 is valid.
+ return true;
+}
+
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
@@ -12671,6 +13389,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+ return false;
+
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
@@ -14137,12 +14876,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
} else {
// __chkstk(MSVCRT): does not update stack pointer.
// Clobbers R10, R11 and EFLAGS.
- // FIXME: RAX(allocated size) might be reused and not killed.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol("__chkstk")
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to subtracted from RSP.
+ // RAX has the offset to be subtracted from RSP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
@@ -14334,6 +15072,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
+
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -14379,6 +15120,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -14779,7 +15522,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
@@ -14820,7 +15563,8 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ array_lengthof(Ops),
Ld->getMemoryVT(),
Ld->getPointerInfo(),
Ld->getAlignment(),
@@ -14874,7 +15618,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
// Don't create instructions with illegal types after legalize types has run.
@@ -14993,7 +15737,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
// All checks match so transform back to vector_shuffle so that DAG combiner
// can finish the job
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// Create shuffle node taking into account the case that its a unary shuffle
SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
@@ -15020,7 +15764,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
N->getValueType(0),
InputVector.getNode()->getOperand(0));
@@ -15065,7 +15809,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Ok, we've now decided to do the transformation.
- DebugLoc dl = InputVector.getDebugLoc();
+ SDLoc dl(InputVector);
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
@@ -15176,7 +15920,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
@@ -15444,7 +16188,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
- Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
}
@@ -15512,6 +16256,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+ // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
+ if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
+ Cond.getOpcode() == ISD::SETCC) {
+
+ assert(Cond.getValueType().isVector() &&
+ "vector select expects a vector selector!");
+
+ EVT IntVT = Cond.getValueType();
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (!TValIsAllOnes && !FValIsAllZeros) {
+ // Try invert the condition if true value is not all 1s and false value
+ // is not all 0s.
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+ if (TValIsAllZeros || FValIsAllOnes) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode NewCC =
+ ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ Cond.getOperand(0).getValueType().isInteger());
+ Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ std::swap(LHS, RHS);
+ TValIsAllOnes = FValIsAllOnes;
+ FValIsAllZeros = TValIsAllZeros;
+ }
+ }
+
+ if (TValIsAllOnes || FValIsAllZeros) {
+ SDValue Ret;
+
+ if (TValIsAllOnes && FValIsAllZeros)
+ Ret = Cond;
+ else if (TValIsAllOnes)
+ Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+ else if (FValIsAllZeros)
+ Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+
+ return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+ }
+ }
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -15572,6 +16361,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
SDValue SetCC;
const ConstantSDNode* C = 0;
bool needOppositeCond = (CC == X86::COND_E);
+ bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
@@ -15580,17 +16370,46 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
else // Quit if all operands are not constants.
return SDValue();
- if (C->getZExtValue() == 1)
+ if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
- else if (C->getZExtValue() != 0)
+ checkAgainstTrue = true;
+ } else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 or 1.
return SDValue();
- // Skip 'zext' node.
- if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
- SetCC = SetCC.getOperand(0);
+ bool truncatedToBoolWithAnd = false;
+ // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+ while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+ SetCC.getOpcode() == ISD::TRUNCATE ||
+ SetCC.getOpcode() == ISD::AND) {
+ if (SetCC.getOpcode() == ISD::AND) {
+ int OpIdx = -1;
+ ConstantSDNode *CS;
+ if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
+ CS->getZExtValue() == 1)
+ OpIdx = 1;
+ if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
+ CS->getZExtValue() == 1)
+ OpIdx = 0;
+ if (OpIdx == -1)
+ break;
+ SetCC = SetCC.getOperand(OpIdx);
+ truncatedToBoolWithAnd = true;
+ } else
+ SetCC = SetCC.getOperand(0);
+ }
switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC_CARRY:
+ // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+ // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+ // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+ // truncated to i1 using 'and'.
+ if (checkAgainstTrue && !truncatedToBoolWithAnd)
+ break;
+ assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+ "Invalid use of SETCC_CARRY!");
+ // FALL THROUGH
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
@@ -15606,9 +16425,15 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
- // A special case for rdrand, where 0 is set if false cond is found.
SDValue Op = SetCC.getOperand(0);
- if (Op.getOpcode() != X86ISD::RDRAND)
+ // Skip 'zext' or 'trunc' node.
+ if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+ Op.getOpcode() == ISD::TRUNCATE)
+ Op = Op.getOperand(0);
+ // A special case for rdrand/rdseed, where 0 is set if false cond is
+ // found.
+ if ((Op.getOpcode() != X86ISD::RDRAND &&
+ Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
@@ -15639,7 +16464,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// If the flag operand isn't dead, don't touch this CMOV.
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
@@ -15842,7 +16667,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
}
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
if (isPowerOf2_64(MulAmt2) &&
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
@@ -15892,7 +16717,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
APInt ShAmt = N1C->getAPIntValue();
Mask = Mask.shl(ShAmt);
if (Mask != 0)
- return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ return DAG.getNode(ISD::AND, SDLoc(N), VT,
N00, DAG.getConstant(Mask, VT));
}
}
@@ -15908,136 +16733,61 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1C && (1 == N1C->getZExtValue())) {
- return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
}
return SDValue();
}
-/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
-/// when possible.
-static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+/// \brief Returns a vector of 0s if the node in input is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
- if (N->getOpcode() == ISD::SHL) {
- SDValue V = PerformSHLCombine(N, DAG);
- if (V.getNode()) return V;
- }
-
- // On X86 with SSE2 support, we can transform this to a vector shift if
- // all elements are shifted by the same amount. We can't do this in legalize
- // because the a constant vector is typically transformed to a constant pool
- // so we have no knowledge of the shift amount.
- if (!Subtarget->hasSSE2())
- return SDValue();
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
(!Subtarget->hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
- SDValue ShAmtOp = N->getOperand(1);
- EVT EltVT = VT.getVectorElementType();
- DebugLoc DL = N->getDebugLoc();
- SDValue BaseShAmt = SDValue();
- if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned i = 0;
- for (; i != NumElts; ++i) {
- SDValue Arg = ShAmtOp.getOperand(i);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- BaseShAmt = Arg;
- break;
- }
- // Handle the case where the build_vector is all undef
- // FIXME: Should DAG allow this?
- if (i == NumElts)
- return SDValue();
+ SDValue Amt = N->getOperand(1);
+ SDLoc DL(N);
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ APInt ShiftAmt = C->getAPIntValue();
+ unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
- for (; i != NumElts; ++i) {
- SDValue Arg = ShAmtOp.getOperand(i);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- if (Arg != BaseShAmt) {
- return SDValue();
- }
+ // SSE2/AVX2 logical shifts always return a vector of 0s
+ // if the shift amount is bigger than or equal to
+ // the element size. The constant shift amount will be
+ // encoded as a 8-bit immediate.
+ if (ShiftAmt.trunc(8).uge(MaxAmount))
+ return getZeroVector(VT, Subtarget, DAG, DL);
}
- } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
- cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
- SDValue InVec = ShAmtOp.getOperand(0);
- if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = InVec.getValueType().getVectorNumElements();
- unsigned i = 0;
- for (; i != NumElts; ++i) {
- SDValue Arg = InVec.getOperand(i);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- BaseShAmt = Arg;
- break;
- }
- } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
- if (C->getZExtValue() == SplatIdx)
- BaseShAmt = InVec.getOperand(1);
- }
- }
- if (BaseShAmt.getNode() == 0) {
- // Don't create instructions with illegal types after legalize
- // types has run.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
- !DCI.isBeforeLegalize())
- return SDValue();
+ }
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
- DAG.getIntPtrConstant(0));
- }
- } else
- return SDValue();
+ return SDValue();
+}
- // The shift amount is an i32.
- if (EltVT.bitsGT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
- else if (EltVT.bitsLT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
+/// PerformShiftCombine - Combine shifts.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (N->getOpcode() == ISD::SHL) {
+ SDValue V = PerformSHLCombine(N, DAG);
+ if (V.getNode()) return V;
+ }
- // The shift amount is identical so we can do a vector shift.
- SDValue ValOp = N->getOperand(0);
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unknown shift opcode!");
- case ISD::SHL:
- switch (VT.getSimpleVT().SimpleTy) {
- default: return SDValue();
- case MVT::v2i64:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v4i64:
- case MVT::v8i32:
- case MVT::v16i16:
- return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
- }
- case ISD::SRA:
- switch (VT.getSimpleVT().SimpleTy) {
- default: return SDValue();
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v8i32:
- case MVT::v16i16:
- return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
- }
- case ISD::SRL:
- switch (VT.getSimpleVT().SimpleTy) {
- default: return SDValue();
- case MVT::v2i64:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v4i64:
- case MVT::v8i32:
- case MVT::v16i16:
- return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
- }
+ if (N->getOpcode() != ISD::SRA) {
+ // Try to fold this logical shift into a zero vector.
+ SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
+ if (V.getNode()) return V;
}
+
+ return SDValue();
}
// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
@@ -16055,7 +16805,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
SDValue CMP1 = N1->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
@@ -16174,7 +16924,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
- DebugLoc DL = Narrow->getDebugLoc();
+ SDLoc DL(Narrow);
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
@@ -16246,7 +16996,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check LHS for neg
if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
@@ -16280,7 +17030,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
@@ -16348,17 +17098,23 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// Validate that the Mask operand is a vector sra node.
// FIXME: what to do for bytes, since there is a psignb/pblendvb, but
// there is no psrai.b
- if (Mask.getOpcode() != X86ISD::VSRAI)
- return SDValue();
-
- // Check that the SRA is all signbits.
- SDValue SraC = Mask.getOperand(1);
- unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned SraAmt = ~0;
+ if (Mask.getOpcode() == ISD::SRA) {
+ SDValue Amt = Mask.getOperand(1);
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
+ SraAmt = C->getZExtValue();
+ }
+ } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+ SDValue SraC = Mask.getOperand(1);
+ SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+ }
if ((SraAmt + 1) != EltBits)
return SDValue();
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Now we know we at least have a plendvb with the mask val. See if
// we can form a psignb/w/d.
@@ -16407,7 +17163,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
unsigned Opc = X86ISD::SHLD;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
@@ -16454,7 +17210,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CMOV.
@@ -16504,7 +17260,7 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
// Create BLSMSK instructions by finding X ^ (X-1)
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
isAllOnes(N0.getOperand(1)))
@@ -16524,15 +17280,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
- DebugLoc dl = Ld->getDebugLoc();
+ SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
+ // On Sandybridge unaligned 256bit loads are inefficient.
ISD::LoadExtType Ext = Ld->getExtensionType();
unsigned Alignment = Ld->getAlignment();
- bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
-
- // On Sandybridge unaligned 256bit loads are inefficient.
+ bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
!DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
unsigned NumElems = RegVT.getVectorNumElements();
@@ -16552,7 +17307,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
Ld->getPointerInfo(), Ld->isVolatile(),
Ld->isNonTemporal(), Ld->isInvariant(),
- std::max(Alignment/2U, 1U));
+ std::min(16U, Alignment));
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1),
Load2.getValue(1));
@@ -16720,16 +17475,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
- DebugLoc dl = St->getDebugLoc();
+ SDLoc dl(St);
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned Alignment = St->getAlignment();
- bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
// If we are saving a concatenation of two XMM registers, perform two stores.
// On Sandy Bridge, 256-bit memory operations are executed by two
// 128-bit ports. However, on Haswell it is better to issue a single 256-bit
// memory operation.
+ unsigned Alignment = St->getAlignment();
+ bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
if (VT.is256BitVector() && !Subtarget->hasInt256() &&
StVT == VT && !IsAligned) {
unsigned NumElems = VT.getVectorNumElements();
@@ -16749,7 +17504,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
St->getPointerInfo(), St->isVolatile(),
St->isNonTemporal(),
- std::max(Alignment/2U, 1U));
+ std::min(16U, Alignment));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
@@ -16883,8 +17638,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
return SDValue();
- DebugLoc LdDL = Ld->getDebugLoc();
- DebugLoc StDL = N->getDebugLoc();
+ SDLoc LdDL(Ld);
+ SDLoc StDL(N);
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
@@ -16977,7 +17732,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- EVT VT = LHS.getValueType();
+ MVT VT = LHS.getValueType().getSimpleVT();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
@@ -17048,23 +17803,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// LHS = VECTOR_SHUFFLE A, B, LMask
// RHS = VECTOR_SHUFFLE A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
- for (unsigned i = 0; i != NumElts; ++i) {
- int LIdx = LMask[i], RIdx = RMask[i];
-
- // Ignore any UNDEF components.
- if (LIdx < 0 || RIdx < 0 ||
- (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
- (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
- continue;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+ // Ignore any UNDEF components.
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
- // Check that successive elements are being operated on. If not, this is
- // not a horizontal operation.
- unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
- unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
- int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
- if (!(LIdx == Index && RIdx == Index + 1) &&
- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
- return false;
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+ return false;
+ }
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
@@ -17083,7 +17839,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, true))
- return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
@@ -17098,7 +17854,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, false))
- return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
@@ -17135,7 +17891,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
- return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
@@ -17152,6 +17908,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+ // FANDN(x, 0.0) -> 0.0
+ // FANDN(0.0, x) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
static SDValue PerformBTCombine(SDNode *N,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
@@ -17179,12 +17948,12 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
- return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
return SDValue();
}
-static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
@@ -17193,7 +17962,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
// both SSE and AVX2 since there is no sign-extended shift right
@@ -17204,14 +17973,14 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
- // EXTLOAD has a better solution on AVX2,
+ // EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
- SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+ SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
@@ -17240,7 +18009,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget* Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
@@ -17285,7 +18054,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
// ISD::SETCC is always legalized to i8.
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -17323,17 +18092,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
LHS.getValueType(), RHS, LHS.getOperand(1));
- return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
RHS.getValueType(), LHS, RHS.getOperand(1));
- return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
return SDValue();
@@ -17342,7 +18111,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, MVT::i8,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
@@ -17353,7 +18122,7 @@ static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -17367,7 +18136,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
@@ -17397,7 +18166,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue EFLAGS = N->getOperand(3);
@@ -17422,7 +18191,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
// SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
@@ -17457,7 +18226,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// We don't have a good way to replace an EFLAGS use, so only do this when
// dead right now.
SDValue(N, 1).use_empty()) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
@@ -17476,7 +18245,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// (sub (sete X, 0), Y) -> sbb 0, Y
// (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Look through ZExts.
SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
@@ -17522,7 +18291,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+ return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
@@ -17542,10 +18311,10 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
isa<ConstantSDNode>(Op1.getOperand(1))) {
APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
EVT VT = Op0.getValueType();
- SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+ SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, VT));
- return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue()+1, VT));
}
}
@@ -17555,7 +18324,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+ return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
@@ -17572,7 +18341,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
if (In.getOpcode() != X86ISD::VZEXT)
return SDValue();
- return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
In.getOperand(0));
}
@@ -17606,6 +18375,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMIN:
case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
@@ -18132,7 +18902,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
getTargetMachine())))
return;
- Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+ Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
GA->getValueType(0), Offset);
break;
}
@@ -18147,7 +18917,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const {
+ MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
@@ -18214,7 +18984,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget->hasSSE1()) break;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
@@ -18239,6 +19009,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v8f32:
case MVT::v4f64:
return std::make_pair(0U, &X86::VR256RegClass);
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ return std::make_pair(0U, &X86::VR512RegClass);
}
break;
}
@@ -18349,7 +19124,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
} else if (Res.second == &X86::FR32RegClass ||
Res.second == &X86::FR64RegClass ||
- Res.second == &X86::VR128RegClass) {
+ Res.second == &X86::VR128RegClass ||
+ Res.second == &X86::VR256RegClass ||
+ Res.second == &X86::FR32XRegClass ||
+ Res.second == &X86::FR64XRegClass ||
+ Res.second == &X86::VR128XRegClass ||
+ Res.second == &X86::VR256XRegClass ||
+ Res.second == &X86::VR512RegClass) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
@@ -18363,6 +19144,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.second = &X86::VR128RegClass;
else if (X86::VR256RegClass.hasType(VT))
Res.second = &X86::VR256RegClass;
+ else if (X86::VR512RegClass.hasType(VT))
+ Res.second = &X86::VR512RegClass;
}
return Res;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index da1dad0..2703274 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -53,6 +53,10 @@ namespace llvm {
/// to X86::XORPS or X86::XORPD.
FXOR,
+ /// FAND - Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
/// FSRL - Bitwise logical right shift of floating point values. These
/// corresponds to X86::PSRLDQ.
FSRL,
@@ -290,6 +294,10 @@ namespace llvm {
// TESTP - Vector packed fp sign bitwise comparisons
TESTP,
+ // OR/AND test for masks
+ KORTEST,
+ KTEST,
+
// Several flavors of instructions with vector shuffle behaviors.
PALIGNR,
PSHUFD,
@@ -313,6 +321,8 @@ namespace llvm {
VPERMI,
VPERM2X128,
VBROADCAST,
+ // masked broadcast
+ VBROADCASTM,
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
@@ -356,10 +366,17 @@ namespace llvm {
// RDRAND - Get a random integer and indicate whether it is valid in CF.
RDRAND,
+ // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
// PCMP*STRI
PCMPISTRI,
PCMPESTRI,
+ // XTEST - Test if in transactional execution.
+ XTEST,
+
// ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
// ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
// Atomic 64-bit binary operations.
@@ -427,25 +444,45 @@ namespace llvm {
/// Define some predicates that are used for node matching.
namespace X86 {
- /// isVEXTRACTF128Index - Return true if the specified
+ /// isVEXTRACT128Index - Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+ bool isVEXTRACT128Index(SDNode *N);
+
+ /// isVINSERT128Index - Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+ bool isVINSERT128Index(SDNode *N);
+
+ /// isVEXTRACT256Index - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
- /// suitable for input to VEXTRACTF128.
- bool isVEXTRACTF128Index(SDNode *N);
+ /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+ bool isVEXTRACT256Index(SDNode *N);
- /// isVINSERTF128Index - Return true if the specified
+ /// isVINSERT256Index - Return true if the specified
/// INSERT_SUBVECTOR operand specifies a subvector insert that is
- /// suitable for input to VINSERTF128.
- bool isVINSERTF128Index(SDNode *N);
+ /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+ bool isVINSERT256Index(SDNode *N);
- /// getExtractVEXTRACTF128Immediate - Return the appropriate
+ /// getExtractVEXTRACT128Immediate - Return the appropriate
/// immediate to extract the specified EXTRACT_SUBVECTOR index
- /// with VEXTRACTF128 instructions.
- unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+ /// with VEXTRACTF128, VEXTRACTI128 instructions.
+ unsigned getExtractVEXTRACT128Immediate(SDNode *N);
- /// getInsertVINSERTF128Immediate - Return the appropriate
+ /// getInsertVINSERT128Immediate - Return the appropriate
/// immediate to insert at the specified INSERT_SUBVECTOR index
- /// with VINSERTF128 instructions.
- unsigned getInsertVINSERTF128Immediate(SDNode *N);
+ /// with VINSERTF128, VINSERT128 instructions.
+ unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+ /// getExtractVEXTRACT256Immediate - Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+ unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+ /// getInsertVINSERT256Immediate - Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+ /// with VINSERTF64x4, VINSERTI64x4 instructions.
+ unsigned getInsertVINSERT256Immediate(SDNode *N);
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
@@ -504,7 +541,7 @@ namespace llvm {
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
virtual EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const;
@@ -556,7 +593,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
/// computeMaskedBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
@@ -603,7 +640,7 @@ namespace llvm {
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const;
+ MVT VT) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
@@ -627,6 +664,8 @@ namespace llvm {
virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+ virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
/// isZExtFree - Return true if any actual instruction that defines a
/// value of type Ty1 implicit zero-extends the value to Ty2 in the result
/// register. This does not necessarily include registers defined in
@@ -639,11 +678,11 @@ namespace llvm {
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
virtual bool isZExtFree(SDValue Val, EVT VT2) const;
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
@@ -716,6 +755,9 @@ namespace llvm {
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
+ /// \brief Reset the operation actions based on target options.
+ virtual void resetOperationActions();
+
protected:
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(MVT VT) const;
@@ -724,9 +766,12 @@ namespace llvm {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- const X86RegisterInfo *RegInfo;
const DataLayout *TD;
+ /// Used to store the TargetOptions so that we don't waste time resetting
+ /// the operation actions unless we have to.
+ TargetOptions TO;
+
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -746,16 +791,16 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA, MachineFrameInfo *MFI,
unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
@@ -777,7 +822,7 @@ namespace llvm {
bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall, bool Is64Bit,
- int FPDiff, DebugLoc dl) const;
+ int FPDiff, SDLoc dl) const;
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
@@ -786,15 +831,16 @@ namespace llvm {
bool isSigned,
bool isReplace) const;
- SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
int64_t Offset, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -808,7 +854,9 @@ namespace llvm {
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
@@ -816,7 +864,7 @@ namespace llvm {
SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -851,7 +899,7 @@ namespace llvm {
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
LowerCall(CallLoweringInfo &CLI,
@@ -862,7 +910,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index bb362f5..ba1aede 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -84,13 +84,16 @@ defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
-def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
-def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr),
- "prefetch\t$addr", []>;
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+ "prefetch\t$addr",
+ [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
-def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr),
- "prefetchw\t$addr", []>;
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+ Requires<[HasPrefetchW]>;
// "3DNowA" instructions
defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 0000000..8abae14
--- /dev/null
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,715 @@
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+// -- 32x8 form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+ "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, f128mem:$src2, i8imm:$src3),
+ "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+}
+
+// -- 64x4 fp form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in {
+def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+ "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+ "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+// -- 32x4 integer form --
+let neverHasSideEffects = 1 in {
+def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+ "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i128mem:$src2, i8imm:$src3),
+ "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+}
+
+let neverHasSideEffects = 1 in {
+// -- 64x4 form --
+def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+ "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+ "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2),
+ (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2),
+ (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2),
+ (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
+ (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
+ (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2),
+ (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2),
+ (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2),
+ (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2),
+ (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2),
+ (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2),
+ (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2),
+ (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2),
+ (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2),
+ (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
+ (bc_v8i32 (loadv4i64 addr:$src2)),
+ (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+// vinsertps - insert f32 to XMM
+def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
+ "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V;
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
+ "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insrtps VR128X:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+// -- 32x4 form --
+def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512;
+def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+let neverHasSideEffects = 1 in {
+// -- 32x4 form --
+def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512;
+def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+ (v4f32 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+ (v2f64 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+ (v2i64 (VEXTRACTI32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+
+def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+ (v8f32 (VEXTRACTF64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)),
+ (v8i32 (VEXTRACTI64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+ (v4f64 (VEXTRACTF64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+ (v4i64 (VEXTRACTI64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+// A 256-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+
+// zmm -> xmm
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+
+
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+
+def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+ (ins VR128X:$src1, u32u8imm:$src2),
+ "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX;
+
+def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
+ "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+ addr:$dst)]>, EVEX;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr,
+ RegisterClass DestRC,
+ RegisterClass SrcRC, X86MemOperand x86memop> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX;
+}
+let ExeDomain = SSEPackedSingle in {
+ defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512,
+ VR128X, f32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
+ VR128X, f64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSDZrm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
+ RegisterClass SrcRC, RegisterClass KRC> {
+ def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, EVEX_V512;
+ def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst),
+ (ins KRC:$mask, SrcRC:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_V512, EVEX_KZ;
+}
+
+defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
+ VEX_W;
+
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+ (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+ (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+ (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+ (VPBROADCASTQrZrr GR64:$src)>;
+
+multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
+ RegisterClass KRC> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
+ def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+ VR128X:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
+ EVEX, EVEX_KZ;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
+ def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+ x86memop:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
+ (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+}
+
+defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
+ loadi32, VR512, v16i32, v4i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
+ loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
+ (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
+ (VBROADCASTSDZrr VR128X:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+ (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+ (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
+ (EXTRACT_SUBREG
+ (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ addr:$src)), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ RegisterClass DstRC, RegisterClass KRC,
+ ValueType OpVT, ValueType SrcVT> {
+def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
+ VK16, v16i32, v16i1>, EVEX_V512;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
+ VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+ string OpcodeStr, RegisterClass KRC,
+ ValueType vt, X86MemOperand x86memop> {
+ let neverHasSideEffects = 1 in {
+ def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ let mayLoad = 1 in
+ def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (vt (load addr:$src)))]>;
+ let mayStore = 1 in
+ def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ }
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+ string OpcodeStr,
+ RegisterClass KRC, RegisterClass GRC> {
+ let neverHasSideEffects = 1 in {
+ def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ }
+}
+
+let Predicates = [HasAVX512] in {
+ defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+ VEX, TB;
+ defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+ VEX, TB;
+}
+
+let Predicates = [HasAVX512] in {
+ // GR16 from/to 16-bit mask
+ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
+ def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+ (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+
+ // Store kreg in memory
+ def : Pat<(store (v16i1 VK16:$src), addr:$dst),
+ (KMOVWmk addr:$dst, VK16:$src)>;
+
+ def : Pat<(store (v8i1 VK8:$src), addr:$dst),
+ (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>;
+}
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+ // GR from/to 8-bit mask without native support
+ def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VK8)>;
+ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (EXTRACT_SUBREG
+ (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+ sub_8bit)>;
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode> {
+ defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, TB;
+}
+
+defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+
+def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
+
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+def : Pat<(not VK8:$src),
+ (COPY_TO_REGCLASS
+ (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+// Mask binary operation
+// - KADD, KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode> {
+ defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX_4V, VEX_L, TB;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+
+let isCommutable = 1 in {
+ defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>;
+ defm KAND : avx512_mask_binop_w<0x41, "kand", and>;
+ let isCommutable = 0 in
+ defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
+ defm KOR : avx512_mask_binop_w<0x45, "kor", or>;
+ defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>;
+ defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>;
+}
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+ VK16:$src1, VK16:$src2),
+ (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>;
+}
+
+defm : avx512_mask_binop_int<"kadd", "KADD">;
+defm : avx512_mask_binop_int<"kand", "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor", "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor", "KXOR">;
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(OpNode VK8:$src1, VK8:$src2),
+ (COPY_TO_REGCLASS
+ (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+ (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+}
+
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<andn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC1, RegisterClass KRC2> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+}
+
+multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
+ defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>,
+ VEX_4V, VEX_L, OpSize, TB;
+}
+
+defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
+
+multiclass avx512_mask_unpck_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+ VK8:$src1, VK8:$src2),
+ (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>;
+}
+
+defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode> {
+ let Predicates = [HasAVX512], Defs = [EFLAGS] in
+ def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, TB;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode> {
+ let Predicates = [HasAVX512] in
+ def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
+ !strconcat(OpcodeStr,
+ "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+ SDNode OpNode> {
+ defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, OpSize, TA, VEX_W;
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>;
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+ let Predicates = [HasAVX512] in
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+ def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+ defm B : avx512_mask_setop<VK8, v8i1, Val>;
+ defm W : avx512_mask_setop<VK16, v16i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+ def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+}
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
+
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+ (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index f406416..9ce02ba 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -294,7 +294,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
// unsigned division/remainder
let hasSideEffects = 1 in { // so that we don't speculatively execute
let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"div{b}\t$src", [], IIC_DIV8_REG>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -310,7 +310,7 @@ def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
} // SchedRW
let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
"div{b}\t$src", [], IIC_DIV8_MEM>,
SchedLoadReg<WriteIDivLd>;
@@ -331,7 +331,7 @@ def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
// Signed division/remainder.
let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"idiv{b}\t$src", [], IIC_IDIV8>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -347,7 +347,7 @@ def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
} // SchedRW
let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
"idiv{b}\t$src", [], IIC_IDIV8>,
SchedLoadReg<WriteIDivLd>;
@@ -932,7 +932,8 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
Format f, list<dag> pattern>
: ITy<0x82, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ Sched<[WriteALULd, WriteRMW]> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -959,18 +960,26 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
[(set EFLAGS, (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src))]>;
-// BinOpAI - Instructions like "add %eax, %eax, imm".
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Register areg, string operands>
: ITy<opcode, RawFrm, typeinfo,
(outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, operands, []> {
+ mnemonic, operands, []>, Sched<[WriteALU]> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
- let Defs = [areg];
+ let Defs = [areg, EFLAGS];
let hasSideEffects = 0;
}
+// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Uses = [areg, EFLAGS];
+}
+
/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
/// defined with "(set GPR:$dst, EFLAGS, (...".
///
@@ -1029,16 +1038,16 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
-
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -1051,7 +1060,7 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnode, bit CommutableRR,
bit ConvertibleToThreeAddress> {
- let Defs = [EFLAGS] in {
+ let Uses = [EFLAGS], Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
@@ -1100,16 +1109,16 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
-
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1167,16 +1176,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
-
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ } // Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
@@ -1194,12 +1203,10 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
}
// Arithmetic.
-let Uses = [EFLAGS] in {
- defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
- 1, 0>;
- defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
- 0, 0>;
-}
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
let isCompare = 1 in {
defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
@@ -1214,44 +1221,46 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
(X86cmp (and_su node:$lhs, node:$rhs), 0)>;
-let isCompare = 1, Defs = [EFLAGS] in {
- let isCommutable = 1 in {
- def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
- def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
- def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
- def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
- } // isCommutable
-
- def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
- def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
- def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
- def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
-
- def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
- def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
- def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
- def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
-
- def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
- def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
- def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
- def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+let isCompare = 1 in {
+ let Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+ } // isCommutable
+
+ def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+ def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+ def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+ def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX.
+ let isPseudo = 1 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ } // Defs = [EFLAGS]
def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
- "{$src, %al|AL, $src}">;
+ "{$src, %al|al, $src}">;
def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
- "{$src, %ax|AX, $src}">;
+ "{$src, %ax|ax, $src}">;
def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
+ "{$src, %eax|eax, $src}">;
def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
-
- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX.
- let isPseudo = 1 in
- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
- "", [], IIC_BIN_NONMEM>;
-}
+ "{$src, %rax|rax, $src}">;
+} // isCompare
//===----------------------------------------------------------------------===//
// ANDN Instruction
@@ -1293,12 +1302,12 @@ let neverHasSideEffects = 1 in {
let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V;
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V;
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
}
}
@@ -1313,6 +1322,7 @@ let Predicates = [HasBMI2] in {
// ADCX Instruction
//
let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"adcx{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_NONMEM>, T8, OpSize;
@@ -1320,12 +1330,13 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"adcx{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ } // SchedRW
- let mayLoad = 1 in {
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"adcx{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8, OpSize;
-
+
def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adcx{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
@@ -1336,6 +1347,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
// ADOX Instruction
//
let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ let SchedRW = [WriteALU] in {
def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"adox{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_NONMEM>, T8XS;
@@ -1343,12 +1355,13 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"adox{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ } // SchedRW
- let mayLoad = 1 in {
+ let mayLoad = 1, SchedRW = [WriteALULd] in {
def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"adox{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8XS;
-
+
def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adox{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 8f2d0a1..a967a4d 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -16,7 +16,7 @@
// SetCC instructions.
multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
- isCommutable = 1 in {
+ isCommutable = 1, SchedRW = [WriteALU] in {
def NAME#16rr
: I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
@@ -37,7 +37,8 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
IIC_CMOV32_RR>, TB;
}
- let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in {
+ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+ SchedRW = [WriteALULd, ReadAfterLd] in {
def NAME#16rm
: I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
@@ -83,11 +84,11 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
def r : I<opc, MRM0r, (outs GR8:$dst), (ins),
!strconcat(Mnemonic, "\t$dst"),
[(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
- IIC_SET_R>, TB;
+ IIC_SET_R>, TB, Sched<[WriteALU]>;
def m : I<opc, MRM0m, (outs), (ins i8mem:$dst),
!strconcat(Mnemonic, "\t$dst"),
[(store (X86setcc OpNode, EFLAGS), addr:$dst)],
- IIC_SET_M>, TB;
+ IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
} // Uses = [EFLAGS]
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 734e598..8969946 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
// The MSVC runtime contains an _ftol2 routine for converting floating-point
// to integer values. It has a strange calling convention: the input is
-// popped from the x87 stack, and the return value is given in EDX:EAX. No
-// other registers (aside from flags) are touched.
+// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
+// used as a temporary register. No other registers (aside from flags) are
+// touched.
// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
// variant is unnecessary.
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
"# win32 fptoui",
[(X86WinFTOL RFP32:$src)]>,
@@ -149,11 +150,12 @@ let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
//
+let SchedRW = [WriteSystem] in {
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR32:$addr)], IIC_RET>;
+ [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
}
@@ -161,7 +163,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, isCodeGenOnly = 1 in {
def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
"ret\t#eh_return, addr: $addr",
- [(X86ehret GR64:$addr)], IIC_RET>;
+ [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
}
@@ -186,6 +188,7 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
Requires<[In64BitMode]>;
}
}
+} // SchedRW
let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
@@ -214,50 +217,41 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias Instructions
//===----------------------------------------------------------------------===//
-// Alias instructions that map movr0 to xor.
+// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
// FIXME: Set encoding to pseudo.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1 in {
-def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
- [(set GR8:$dst, 0)], IIC_ALU_NONMEM>;
-
-// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
-// encoding and avoids a partial-register update sometimes, but doing so
-// at isel time interferes with rematerialization in the current register
-// allocator. For now, this is rewritten when the instruction is lowered
-// to an MCInst.
-def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
- "",
- [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize;
-
-// FIXME: Set encoding to pseudo.
+ isCodeGenOnly = 1 in
def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 0)], IIC_ALU_NONMEM>;
+ [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+ let AddedComplexity = 20;
}
-// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
-// smaller encoding, but doing so at isel time interferes with rematerialization
-// in the current register allocator. For now, this is rewritten when the
-// instruction is lowered to an MCInst.
-// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
-// when we have a better way to specify isel priority.
-let Defs = [EFLAGS], isCodeGenOnly=1,
- AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, 0)], IIC_ALU_NONMEM>;
-
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1 in
-def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
- "", [(set GR64:$dst, i64immZExt32:$src)],
- IIC_ALU_NONMEM>;
+ isCodeGenOnly = 1, neverHasSideEffects = 1 in
+def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
+ "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// This 64-bit pseudo-move can be used for both a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant, and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
// Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1 in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
// However, Pat<> can't replicate the destination reg into the inputs of the
// result.
@@ -320,6 +314,7 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
//===----------------------------------------------------------------------===//
// String Pseudo Instructions
//
+let SchedRW = [WriteMicrocoded] in {
let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
[(X86rep_movs i8)], IIC_REP_MOVS>, REP,
@@ -382,6 +377,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
[(X86rep_stos i64)], IIC_REP_STOS>, REP,
Requires<[In64BitMode]>;
}
+} // SchedRW
//===----------------------------------------------------------------------===//
// Thread Local Storage Instructions
@@ -594,12 +590,13 @@ defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
"or{l}\t{$zero, $dst|$dst, $zero}",
- [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK;
+ [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK,
+ Sched<[WriteALULd, WriteRMW]>;
let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
"#MEMBARRIER",
- [(X86MemBarrier)]>;
+ [(X86MemBarrier)]>, Sched<[WriteLoad]>;
// RegOpc corresponds to the mr version of the instruction
// ImmOpc corresponds to the mi version of the instruction
@@ -607,7 +604,8 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
// ImmMod corresponds to the instruction format of the mi and mi8 versions
multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
Format ImmMod, string mnemonic> {
-let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
@@ -694,7 +692,8 @@ defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
// Optimized codegen when the non-memory output is not used.
multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic> {
-let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
!strconcat(mnemonic, "{b}\t$dst"),
@@ -728,7 +727,7 @@ let isCodeGenOnly = 1 in {
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic, SDPatternOperator frag,
InstrItinClass itin8, InstrItinClass itin> {
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
let Defs = [AL, EFLAGS], Uses = [AL] in
def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
!strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
@@ -748,14 +747,15 @@ let isCodeGenOnly = 1 in {
}
}
-let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+ SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
X86cas8, i64mem,
IIC_CMPX_LOCK_8B>;
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b] in {
+ Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
X86cas16, i128mem,
IIC_CMPX_LOCK_16B>, REX_W;
@@ -768,7 +768,8 @@ defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
string frag,
InstrItinClass itin8, InstrItinClass itin> {
- let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+ SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
@@ -932,20 +933,6 @@ def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
(MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
-// In static codegen with small code model, we can get the address of a label
-// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of
-// the MOV64ri64i32 should accept these.
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
- (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
-
// In kernel code model, we can get the address of a label
// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
// the MOV64ri32 should accept these.
@@ -990,9 +977,6 @@ def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
// This corresponds to add $foo@tpoff, %rax
def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
(ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
-// This corresponds to mov foo@tpoff(%rbx), %eax
-def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))),
- (MOV64rm tglobaltlsaddr :$dst)>;
// Direct PC relative function call for small code model. 32-bit displacement
@@ -1112,7 +1096,8 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
@@ -1126,14 +1111,16 @@ def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
-def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
-def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
-def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i32 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
- sub_32bit)>;
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
@@ -1145,8 +1132,10 @@ def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
def : Pat<(i32 (anyext GR16:$src)),
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
-def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
-def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR8 :$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR32:$src)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
@@ -1192,7 +1181,8 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
-let AddedComplexity = 5 in { // Try this before the selecting to OR
+// Try this before the selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
let isConvertibleToThreeAddress = 1,
Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
@@ -1239,7 +1229,7 @@ def ADD64ri32_DB : I<0, Pseudo,
[(set GR64:$dst, (or_is_add GR64:$src1,
i64immSExt32:$src2))]>;
}
-} // AddedComplexity
+} // AddedComplexity, SchedRW
//===----------------------------------------------------------------------===//
@@ -1310,13 +1300,19 @@ def : Pat<(and GR16:$src1, 0xff),
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
- (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+ sub_32bit)>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
- (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+ sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR64:$src, 0xff),
- (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+ sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index bfe9541..0e69651 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -20,7 +20,7 @@
// The X86retflag return instructions are variadic because we may add ST0 and
// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
- hasCtrlDep = 1, FPForm = SpecialFP in {
+ hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret",
[(X86retflag 0)], IIC_RET>;
@@ -46,7 +46,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
}
// Unconditional branches.
-let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
"jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
@@ -58,7 +58,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
}
// Conditional Branches.
-let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in {
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
IIC_Jcc>;
@@ -85,7 +85,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
// jcx/jecx/jrcx instructions.
-let isBranch = 1, isTerminator = 1 in {
+let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
// These are the 32-bit versions of this instruction for the asmparser. In
// 32-bit mode, the address size prefix is jcxz and the unprefixed version is
// jecxz.
@@ -110,36 +110,46 @@ let isBranch = 1, isTerminator = 1 in {
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
- [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>;
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>,
+ Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
- [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode]>;
+ [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In32BitMode]>, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
- [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>;
+ [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+ Sched<[WriteJump]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
- [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>;
+ [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "ljmp{w}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>, OpSize;
+ "ljmp{w}\t{$seg, $off|$off, $seg}", [],
+ IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "ljmp{l}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>;
+ "ljmp{l}\t{$seg, $off|$off, $seg}", [],
+ IIC_JMP_FAR_PTR>, Sched<[WriteJump]>;
def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
- "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>;
+ "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJump]>;
def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
- "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize;
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize,
+ Sched<[WriteJumpLd]>;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>;
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ Sched<[WriteJumpLd]>;
}
// Loop instructions
-
+let SchedRW = [WriteJump] in {
def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
//===----------------------------------------------------------------------===//
// Call Instructions...
@@ -152,27 +162,32 @@ let isCall = 1 in
let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
- "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
+ "call{l}\t$dst", [], IIC_CALL_RI>,
+ Requires<[In32BitMode]>, Sched<[WriteJump]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
- Requires<[In32BitMode]>;
+ Requires<[In32BitMode]>, Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
- "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>,
- Requires<[In32BitMode]>;
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In32BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"lcall{w}\t{$seg, $off|$off, $seg}", [],
- IIC_CALL_FAR_PTR>, OpSize;
+ IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"lcall{l}\t{$seg, $off|$off, $seg}", [],
- IIC_CALL_FAR_PTR>;
+ IIC_CALL_FAR_PTR>, Sched<[WriteJump]>;
def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
- "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize;
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize,
+ Sched<[WriteJumpLd]>;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>,
+ Sched<[WriteJumpLd]>;
// callw for 16 bit code for the assembler.
let isAsmParserOnly = 1 in
@@ -185,7 +200,7 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1 in
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
let Uses = [ESP] in {
def TCRETURNdi : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset), []>;
@@ -216,7 +231,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// RSP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead. Uses for argument
// registers are added manually.
-let isCall = 1, Uses = [RSP] in {
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
// NOTE: this pattern doesn't match "X86call imm", because we do not know
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
@@ -231,7 +246,7 @@ let isCall = 1, Uses = [RSP] in {
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
IIC_CALL_MEM>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode,FavorMemIndirectCall]>;
def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
"lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
@@ -245,13 +260,12 @@ let isCall = 1, isCodeGenOnly = 1 in
def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>,
- Requires<[IsWin64]>;
+ Requires<[IsWin64]>, Sched<[WriteJump]>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1 in
- let Uses = [RSP],
- usesCustomInserter = 1 in {
+ isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+ SchedRW = [WriteJump] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 2eb454d..28954c6 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -42,48 +42,54 @@ let neverHasSideEffects = 1 in {
let neverHasSideEffects = 1 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
- TB, OpSize;
+ TB, OpSize, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
- TB, OpSize;
+ TB, OpSize, Sched<[WriteALULd]>;
} // neverHasSideEffects = 1
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB;
+ [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB;
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALULd]>;
def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB;
+ [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
- TB;
+ TB, Sched<[WriteALULd]>;
let neverHasSideEffects = 1 in {
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
- TB, OpSize;
+ TB, OpSize, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
- TB, OpSize;
+ TB, OpSize, Sched<[WriteALULd]>;
} // neverHasSideEffects = 1
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB;
+ [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB;
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALULd]>;
def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB;
+ [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+ Sched<[WriteALU]>;
def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
- TB;
+ TB, Sched<[WriteALULd]>;
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
@@ -92,12 +98,12 @@ let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [], IIC_MOVZX>, TB;
+ [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [], IIC_MOVZX>, TB;
+ [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
}
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
@@ -106,68 +112,61 @@ def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
// were generalized, this would require a special register class.
def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB;
+ [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
- TB;
+ TB, Sched<[WriteALULd]>;
def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB;
+ [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+ Sched<[WriteALU]>;
def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
- TB;
+ TB, Sched<[WriteALULd]>;
def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>;
+ [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+ Sched<[WriteALU]>;
def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>;
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+ Sched<[WriteALULd]>;
// movzbq and movzwq encodings for the disassembler
def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
"movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
- TB;
+ TB, Sched<[WriteALU]>;
def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
"movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
- TB;
+ TB, Sched<[WriteALULd]>;
def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
"movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
- TB;
+ TB, Sched<[WriteALU]>;
def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
- TB;
-
-// FIXME: These should be Pat patterns.
-let isCodeGenOnly = 1 in {
-
-// Use movzbl instead of movzbq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
- "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB;
-def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
- "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
- TB;
-// Use movzwl instead of movzwq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB;
-def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
- IIC_MOVZX>, TB;
-
-// There's no movzlq instruction, but movl can be used for this purpose, using
-// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
-// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
-// zero-extension, however this isn't possible when the 32-bit value is
-// defined by a truncate or is copied from something where the high bits aren't
-// necessarily all zero. In such cases, we fall back to these explicit zext
-// instructions.
-def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
- "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>;
-def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
- "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
- IIC_MOVZX>;
-}
-
+ TB, Sched<[WriteALULd]>;
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 568726e..7c37888 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm>
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">;
def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">;
def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
@@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovb\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmove\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovu\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovne\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB;
} // Predicates = [HasCMov]
// Floating point loads & stores.
@@ -422,7 +422,7 @@ def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
}
-let mayLoad = 1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
IIC_FLD>;
def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
@@ -436,7 +436,7 @@ def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
IIC_FILD>;
}
-let mayStore = 1 in {
+let mayStore = 1, SchedRW = [WriteStore] in {
def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
IIC_FST>;
def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
@@ -481,7 +481,7 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
[(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
} // Predicates = [HasSSE3]
-let mayStore = 1 in {
+let mayStore = 1, SchedRW = [WriteStore] in {
def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
IIC_FST>;
def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
@@ -491,6 +491,7 @@ def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
}
// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
IIC_FLD>, D9;
def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
@@ -499,6 +500,7 @@ def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
IIC_FST>, DD;
def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
IIC_FXCH>, D9;
+}
// Floating point constant loads.
let isReMaterializable = 1 in {
@@ -516,19 +518,23 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
+let SchedRW = [WriteZero] in {
def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
-
+}
// Floating point compares.
+let SchedRW = [WriteFAdd] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
[(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
} // Defs = [FPSW]
+let SchedRW = [WriteFAdd] in {
// CC = ST(0) cmp ST(i)
let Defs = [EFLAGS, FPSW] in {
def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
@@ -566,33 +572,38 @@ def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
"fcompi\t$reg", IIC_FCOMI>, DF;
}
+} // SchedRW
// Floating point flag ops.
+let SchedRW = [WriteALU] in {
let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
- (outs), (ins), "fnstsw %ax",
+ (outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
[(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
-
+} // SchedRW
let mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>;
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+ Sched<[WriteLoad]>;
// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in
def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
"ffree\t$reg", IIC_FFREE>, DD;
-
// Clear exceptions
let Defs = [FPSW] in
def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
+} // SchedRW
// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
@@ -627,6 +638,7 @@ def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
"fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
Requires<[In64BitMode]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 44e574d..64018b3 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -35,24 +35,27 @@ def MRM_C3 : Format<35>;
def MRM_C4 : Format<36>;
def MRM_C8 : Format<37>;
def MRM_C9 : Format<38>;
-def MRM_E8 : Format<39>;
-def MRM_F0 : Format<40>;
-def MRM_F8 : Format<41>;
-def MRM_F9 : Format<42>;
+def MRM_CA : Format<39>;
+def MRM_CB : Format<40>;
+def MRM_E8 : Format<41>;
+def MRM_F0 : Format<42>;
def RawFrmImm8 : Format<43>;
def RawFrmImm16 : Format<44>;
-def MRM_D0 : Format<45>;
-def MRM_D1 : Format<46>;
-def MRM_D4 : Format<47>;
-def MRM_D5 : Format<48>;
-def MRM_D8 : Format<49>;
-def MRM_D9 : Format<50>;
-def MRM_DA : Format<51>;
-def MRM_DB : Format<52>;
-def MRM_DC : Format<53>;
-def MRM_DD : Format<54>;
-def MRM_DE : Format<55>;
-def MRM_DF : Format<56>;
+def MRM_F8 : Format<45>;
+def MRM_F9 : Format<46>;
+def MRM_D0 : Format<47>;
+def MRM_D1 : Format<48>;
+def MRM_D4 : Format<49>;
+def MRM_D5 : Format<50>;
+def MRM_D6 : Format<51>;
+def MRM_D8 : Format<52>;
+def MRM_D9 : Format<53>;
+def MRM_DA : Format<54>;
+def MRM_DB : Format<55>;
+def MRM_DC : Format<56>;
+def MRM_DD : Format<57>;
+def MRM_DE : Format<58>;
+def MRM_DF : Format<59>;
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -93,6 +96,20 @@ def SSEPackedSingle : Domain<1>;
def SSEPackedDouble : Domain<2>;
def SSEPackedInt : Domain<3>;
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+class CD8VForm<bits<3> val> {
+ bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
// Prefix byte classes which are used to indicate to the ad-hoc machine code
// emitter that various prefix bytes are required.
class OpSize { bit hasOpSizePrefix = 1; }
@@ -129,6 +146,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { bit hasEVEXPrefix = 1; }
+class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_CD8<int esize, CD8VForm form> {
+ bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00,
+ !if(!eq(esize, 16), 0b01,
+ !if(!eq(esize, 32), 0b10,
+ !if(!eq(esize, 64), 0b11, ?))));
+ bits<3> EVEX_CD8V = form.Value;
+}
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class MemOp4 { bit hasMemOp4Prefix = 1; }
class XOP { bit hasXOP_Prefix = 1; }
@@ -174,6 +204,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// to be encoded in a immediate field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
+ bit hasEVEXPrefix = 0; // Does this inst require EVEX form?
+ bit hasEVEX_K = 0; // Does this inst require masking?
+ bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
+ bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
+ bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size.
+ bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width.
bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix?
@@ -197,9 +234,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{37} = hasVEX_i8ImmReg;
let TSFlags{38} = hasVEX_L;
let TSFlags{39} = ignoresVEX_L;
- let TSFlags{40} = has3DNow0F0FOpcode;
- let TSFlags{41} = hasMemOp4Prefix;
- let TSFlags{42} = hasXOP_Prefix;
+ let TSFlags{40} = hasEVEXPrefix;
+ let TSFlags{41} = hasEVEX_K;
+ let TSFlags{42} = hasEVEX_Z;
+ let TSFlags{43} = hasEVEX_L2;
+ let TSFlags{44} = hasEVEX_B;
+ let TSFlags{46-45} = EVEX_CD8E;
+ let TSFlags{49-47} = EVEX_CD8V;
+ let TSFlags{50} = has3DNow0F0FOpcode;
+ let TSFlags{51} = hasMemOp4Prefix;
+ let TSFlags{52} = hasXOP_Prefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -208,47 +252,47 @@ class PseudoI<dag oops, dag iops, list<dag> pattern>
}
class I<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
Domain d = GenericDomain>
: X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
Domain d = GenericDomain>
: X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm16, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm32, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
@@ -257,12 +301,12 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
- InstrItinClass itin = IIC_DEFAULT>
+ InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, [], itin> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
- InstrItinClass itin = IIC_DEFAULT>
+ InstrItinClass itin = NoItinerary>
: X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
let FPForm = fp;
let Pattern = pattern;
@@ -275,27 +319,30 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
// Iseg32 - 16-bit segment selector, 32-bit offset
class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm16, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm32, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
def __xs : XS;
+def __xd : XD;
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1],
+ !if(!eq(Prefix, __xd.Prefix), [UseSSE2],
+ !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -303,7 +350,7 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SIi8 - SSE 1 & 2 scalar instructions
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
@@ -347,28 +394,28 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// PSI - SSE1 instructions with TB prefix.
// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
// VSSI - SSE1 instructions with XS prefix in AVX form.
-// VPSI - SSE1 instructions with TB prefix in AVX form.
+// VPSI - SSE1 instructions with TB prefix in AVX form, packed single.
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB,
Requires<[HasAVX]>;
@@ -378,52 +425,63 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
// S2SI - SSE2 instructions with XS prefix.
// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
-// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDI - SSE2 instructions with TB and OpSize prefixes, packed double domain.
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
-// VSDI - SSE2 instructions with XD prefix in AVX form.
-// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VPDI - SSE2 vector instructions with TB and OpSize prefixes in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with TB and OpSize prefixes in AVX form.
+// S2I - SSE2 scalar instructions with TB and OpSize prefixes.
// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
// MMX operands.
// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
// MMX operands.
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
Requires<[HasAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
OpSize, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
+ OpSize, Requires<[HasAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB,
+ OpSize, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
// SSE3 Instruction Templates:
@@ -433,15 +491,15 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// S3DI - SSE3 instructions with XD prefix.
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[UseSSE3]>;
@@ -458,19 +516,19 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
// classes. They need to be enabled even if AVX is enabled.
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[UseSSSE3]>;
class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasSSSE3]>;
class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSSE3]>;
@@ -480,11 +538,11 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[UseSSE41]>;
@@ -492,19 +550,19 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
//
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[UseSSE42]>;
@@ -514,11 +572,11 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX8I - AVX instructions with T8 and OpSize prefix.
// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
Requires<[HasAVX]>;
@@ -528,66 +586,134 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX28I - AVX2 instructions with T8 and OpSize prefix.
// AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
Requires<[HasAVX2]>;
+
+// AVX-512 Instruction Templates:
+// Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+// AVX5128I - AVX-512 instructions with T8 and OpSize prefix.
+// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed.
+// AVX512PSI - AVX-512 instructions with TB, single packed.
+// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
+// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain.
+// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB,
+ OpSize, Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8,
+ OpSize, EVEX_4V, Requires<[HasAVX512]>;
+
// AES Instruction Templates:
//
// AES8I
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, Requires<[HasPCLMUL]>;
class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8,
- OpSize, VEX_4V, Requires<[HasFMA]>;
+ OpSize, VEX_4V, FMASC, Requires<[HasFMA]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, TA,
- OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>;
+ OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP, XOP9, Requires<[HasXOP]>;
// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP, XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
@@ -595,34 +721,47 @@ class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
//
class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
let Pattern = pattern;
let CodeSize = 3;
}
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
// MMX Instruction templates
//
@@ -635,23 +774,23 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXID - MMX instructions with XD prefix.
// MMXIS - MMX instructions with XS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>;
class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 2a72fb6..0b51521 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -47,6 +47,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
@@ -136,6 +138,8 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -153,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
-def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
@@ -192,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
@@ -405,28 +412,54 @@ def BYTE_imm : SDNodeXForm<imm, [{
return getI32Imm(N->getZExtValue() >> 3);
}]>;
-// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
-// to VEXTRACTF128 imm.
-def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
- return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT128Immediate(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT128Immediate(N));
}]>;
-// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
-// VINSERTF128 imm.
-def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
- return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT256Immediate(N));
}]>;
-def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT256Immediate(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
node:$index), [{
- return X86::isVEXTRACTF128Index(N);
-}], EXTRACT_get_vextractf128_imm>;
+ return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
-def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
(insert_subvector node:$bigvec, node:$smallvec,
node:$index), [{
- return X86::isVINSERTF128Index(N);
-}], INSERT_get_vinsertf128_imm>;
+ return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 17714ac..0443a93 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -97,7 +97,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
(tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32)),
- TM(tm), RI(tm, *this) {
+ TM(tm), RI(tm) {
static const X86OpTblEntry OpTbl2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
@@ -451,9 +451,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
- { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
- { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
@@ -1381,7 +1378,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- case X86::MOVZX64rr8:
if (!TM.getSubtarget<X86Subtarget>().is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
@@ -1389,9 +1385,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
- case X86::MOVZX64rr16:
- case X86::MOVSX64rr32:
- case X86::MOVZX64rr32: {
+ case X86::MOVSX64rr32: {
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
// Be conservative.
return false;
@@ -1404,17 +1398,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- case X86::MOVZX64rr8:
SubIdx = X86::sub_8bit;
break;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
- case X86::MOVZX64rr16:
SubIdx = X86::sub_16bit;
break;
case X86::MOVSX64rr32:
- case X86::MOVZX64rr32:
SubIdx = X86::sub_32bit;
break;
}
@@ -1722,37 +1713,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const {
- DebugLoc DL = Orig->getDebugLoc();
-
- // MOV32r0 etc. are implemented with xor which clobbers condition code.
- // Re-materialize them as movri instructions to avoid side effects.
- bool Clone = true;
+ // MOV32r0 is implemented with a xor which clobbers condition code.
+ // Re-materialize it as movri instructions to avoid side effects.
unsigned Opc = Orig->getOpcode();
- switch (Opc) {
- default: break;
- case X86::MOV8r0:
- case X86::MOV16r0:
- case X86::MOV32r0:
- case X86::MOV64r0: {
- if (!isSafeToClobberEFLAGS(MBB, I)) {
- switch (Opc) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOV8r0: Opc = X86::MOV8ri; break;
- case X86::MOV16r0: Opc = X86::MOV16ri; break;
- case X86::MOV32r0: Opc = X86::MOV32ri; break;
- case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
- }
- Clone = false;
- }
- break;
- }
- }
-
- if (Clone) {
+ if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+ DebugLoc DL = Orig->getDebugLoc();
+ BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
+ .addImm(0);
+ } else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
MBB.insert(I, MI);
- } else {
- BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
}
MachineInstr *NewMI = prior(I);
@@ -1772,6 +1742,98 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
return false;
}
+/// getTruncatedShiftCount - check whether the shift count for a machine operand
+/// is non-zero.
+inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+ unsigned ShiftAmtOperandIdx) {
+ // The shift count is six bits with the REX.W prefix and five bits without.
+ unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+ unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+ return Imm & ShiftCountMask;
+}
+
+/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate
+/// can be represented by a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+ // Left shift instructions can be transformed into load-effective-address
+ // instructions if we can encode them appropriately.
+ // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+ // The SIB.scale field is two bits wide which means that we can encode any
+ // shift amount less than 4.
+ return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP,
+ unsigned &NewSrc, bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterClass *RC;
+ if (AllowSP) {
+ RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+ } else {
+ RC = Opc != X86::LEA32r ?
+ &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+ }
+ unsigned SrcReg = Src.getReg();
+
+ // For both LEA64 and LEA32 the register already has essentially the right
+ // type (32-bit or 64-bit) we may just need to forbid SP.
+ if (Opc != X86::LEA64_32r) {
+ NewSrc = SrcReg;
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
+
+ if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ return false;
+
+ return true;
+ }
+
+ // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+ // another we need to add 64-bit registers to the final MI.
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ ImplicitOp = Src;
+ ImplicitOp.setImplicit();
+
+ NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+
+ switch (LQR) {
+ case MachineBasicBlock::LQR_Unknown:
+ // We can't give sane liveness flags to the instruction, abandon LEA
+ // formation.
+ return false;
+ case MachineBasicBlock::LQR_Live:
+ isKill = MI->killsRegister(SrcReg);
+ isUndef = false;
+ break;
+ default:
+ // The physreg itself is dead, so we have to use it as an <undef>.
+ isKill = false;
+ isUndef = true;
+ break;
+ }
+ } else {
+ // Virtual register of the wrong class, we have to create a temporary 64-bit
+ // vreg to feed into the LEA.
+ NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .addOperand(Src);
+
+ // Which is obviously going to be dead after we're done with it.
+ isKill = true;
+ isUndef = false;
+ }
+
+ // We've set all the parameters without issue.
+ return true;
+}
+
/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
/// to a 32-bit superregister and then truncating back down to a 16-bit
@@ -1787,11 +1849,16 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
bool isDead = MI->getOperand(0).isDead();
bool isKill = MI->getOperand(1).isKill();
- unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::LEA64_32r : X86::LEA32r;
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
- unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned Opc, leaInReg;
+ if (TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ Opc = X86::LEA64_32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ } else {
+ Opc = X86::LEA32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ }
// Build and insert into an implicit UNDEF value. This is OK because
// well be shifting and then extracting the lower 16-bits.
@@ -1841,7 +1908,10 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
- leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ else
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// well be shifting and then extracting the lower 16-bits.
BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
@@ -1891,6 +1961,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const {
MachineInstr *MI = MBBI;
+
+ // The following opcodes also sets the condition code register(s). Only
+ // convert them to equivalent lea if the condition code register def's
+ // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
MachineFunction &MF = *MI->getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
const MachineOperand &Dest = MI->getOperand(0);
@@ -1935,10 +2012,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHL64ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
@@ -1953,29 +2028,34 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHL32ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
// LEA can't handle ESP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(),
- &X86::GR32_NOSPRegClass))
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
- .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addImm(0).addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ NewMI = MIB;
+
break;
}
case X86::SHL16ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
@@ -1985,11 +2065,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
default: {
- // The following opcodes also sets the condition code register(s). Only
- // convert them to equivalent lea if the condition code register def's
- // are dead!
- if (hasLiveCondCodeDef(MI))
- return 0;
switch (MIOpc) {
default: return 0;
@@ -1999,17 +2074,20 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
- const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
- (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
- (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
-
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src), 1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, 1);
break;
}
case X86::INC16r:
@@ -2026,16 +2104,22 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
- const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
- (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
- (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src), -1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, -1);
+
break;
}
case X86::DEC16r:
@@ -2052,36 +2136,41 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32rr_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc;
- const TargetRegisterClass *RC;
- if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
Opc = X86::LEA64r;
- RC = &X86::GR64_NOSPRegClass;
- } else {
+ else
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- RC = &X86::GR32_NOSPRegClass;
- }
-
- unsigned Src2 = MI->getOperand(2).getReg();
- bool isKill2 = MI->getOperand(2).isKill();
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return 0;
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src2) &&
- !MF.getRegInfo().constrainRegClass(Src2, RC))
+ const MachineOperand &Src2 = MI->getOperand(2);
+ bool isKill2, isUndef2;
+ unsigned SrcReg2;
+ MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+ SrcReg2, isKill2, isUndef2, ImplicitOp2))
return 0;
- NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest),
- Src.getReg(), Src.isKill(), Src2, isKill2);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ if (ImplicitOp2.getReg() != 0)
+ MIB.addOperand(ImplicitOp2);
+
+ NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
// Preserve undefness of the operands.
- bool isUndef = MI->getOperand(1).isUndef();
- bool isUndef2 = MI->getOperand(2).isUndef();
NewMI->getOperand(1).setIsUndef(isUndef);
NewMI->getOperand(3).setIsUndef(isUndef2);
- if (LV && isKill2)
- LV->replaceKillInstruction(Src2, MI, NewMI);
+ if (LV && Src2.isKill())
+ LV->replaceKillInstruction(SrcReg2, MI, NewMI);
break;
}
case X86::ADD16rr:
@@ -2120,9 +2209,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32ri8_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src),
- MI->getOperand(2).getImm());
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return 0;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, MI->getOperand(2).getImm());
break;
}
case X86::ADD16ri:
@@ -3171,6 +3272,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
inline static bool isDefConvertible(MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return false;
+
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
+ return getTruncatedShiftCount(MI, 2) != 0;
+
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+ return ShAmt != 0;
+ }
+
+ case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+ case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+ return getTruncatedShiftCount(MI, 3) != 0;
+
case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
@@ -3200,8 +3320,37 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
+ case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC32rr: case X86::ADC64ri32:
+ case X86::ADC64ri8: case X86::ADC64rr:
+ case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB32rr: case X86::SBB64ri32:
+ case X86::SBB64ri8: case X86::SBB64rr:
case X86::ANDN32rr: case X86::ANDN32rm:
case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BEXTR32rr: case X86::BEXTR64rr:
+ case X86::BEXTR32rm: case X86::BEXTR64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
return true;
}
}
@@ -3308,10 +3457,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// MOV32r0 etc. are implemented with xor which clobbers condition code.
// They are safe to move up, if the definition to EFLAGS is dead and
// earlier instructions do not read or write EFLAGS.
- if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
- Instr->getOpcode() == X86::MOV16r0 ||
- Instr->getOpcode() == X86::MOV32r0 ||
- Instr->getOpcode() == X86::MOV64r0) &&
+ if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
Movr0Inst = Instr;
continue;
@@ -3420,20 +3566,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// The instruction to be updated is either Sub or MI.
Sub = IsCmpZero ? MI : Sub;
- // Move Movr0Inst to the place right before Sub.
+ // Move Movr0Inst to the appropriate place before Sub.
if (Movr0Inst) {
- Sub->getParent()->remove(Movr0Inst);
- Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+ // Look backwards until we find a def that doesn't use the current EFLAGS.
+ Def = Sub;
+ MachineBasicBlock::reverse_iterator
+ InsertI = MachineBasicBlock::reverse_iterator(++Def),
+ InsertE = Sub->getParent()->rend();
+ for (; InsertI != InsertE; ++InsertI) {
+ MachineInstr *Instr = &*InsertI;
+ if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+ Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+ Sub->getParent()->remove(Movr0Inst);
+ Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+ Movr0Inst);
+ break;
+ }
+ }
+ if (InsertI == InsertE)
+ return false;
}
// Make sure Sub instruction defines EFLAGS and mark the def live.
- unsigned LastOperand = Sub->getNumOperands() - 1;
- assert(Sub->getNumOperands() >= 2 &&
- Sub->getOperand(LastOperand).isReg() &&
- Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
- "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
- Sub->getOperand(LastOperand).setIsDef(true);
- Sub->getOperand(LastOperand).setIsDead(false);
+ unsigned i = 0, e = Sub->getNumOperands();
+ for (; i != e; ++i) {
+ MachineOperand &MO = Sub->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ MO.setIsDead(false);
+ break;
+ }
+ }
+ assert(i != e && "Unable to locate a def EFLAGS operand");
+
CmpInstr->eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
@@ -3569,19 +3733,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return false;
}
-MachineInstr*
-X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx, uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const {
- X86AddressMode AM;
- AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = FrameIx;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
- addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
- return &*MIB;
-}
-
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
const SmallVectorImpl<MachineOperand> &MOs,
MachineInstr *MI,
@@ -3655,7 +3806,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Align) const {
const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+ bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
bool isTwoAddrFold = false;
+
+ // Atom favors register form of call. So, we do not fold loads into calls
+ // when X86Subtarget is Atom.
+ if (isCallRegIndirect &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+ return NULL;
+ }
+
unsigned NumOps = MI->getDesc().getNumOperands();
bool isTwoAddr = NumOps > 1 &&
MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
@@ -3677,18 +3837,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (i == 0) { // If operand 0
- unsigned Opc = 0;
- switch (MI->getOpcode()) {
- default: break;
- case X86::MOV64r0: Opc = X86::MOV64mi32; break;
- case X86::MOV32r0: Opc = X86::MOV32mi; break;
- case X86::MOV16r0: Opc = X86::MOV16mi; break;
- case X86::MOV8r0: Opc = X86::MOV8mi; break;
+ if (MI->getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ if (NewMI)
+ return NewMI;
}
- if (Opc)
- NewMI = MakeM0Inst(*this, Opc, MOs, MI);
- if (NewMI)
- return NewMI;
OpcodeTablePtr = &RegOp2MemOpTable0;
} else if (i == 1) {
@@ -4074,13 +4227,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
} else if (OpNum == 0) { // If operand 0
- switch (Opc) {
- case X86::MOV8r0:
- case X86::MOV16r0:
- case X86::MOV32r0:
- case X86::MOV64r0: return true;
- default: break;
- }
+ if (Opc == X86::MOV32r0)
+ return true;
+
OpcodeTablePtr = &RegOp2MemOpTable0;
} else if (OpNum == 1) {
OpcodeTablePtr = &RegOp2MemOpTable1;
@@ -4241,7 +4390,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
std::vector<SDValue> AfterOps;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
unsigned NumOps = N->getNumOperands();
for (unsigned i = 0; i != NumOps-1; ++i) {
SDValue Op = N->getOperand(i);
@@ -4272,7 +4421,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
- VT, MVT::Other, &AddrOps[0], AddrOps.size());
+ VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
// Preserve memory reference information.
@@ -4294,8 +4443,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
- SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0],
- BeforeOps.size());
+ SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
NewNodes.push_back(NewNode);
// Emit the store instruction.
@@ -4317,8 +4465,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
(*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
isAligned, TM),
- dl, MVT::Other,
- &AddrOps[0], AddrOps.size());
+ dl, MVT::Other, AddrOps);
NewNodes.push_back(Store);
// Preserve memory reference information.
@@ -4500,6 +4647,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
+ MachineInstr *Second) const {
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+ return false;
+
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ switch(Second->getOpcode()) {
+ default:
+ return false;
+ case X86::JE_4:
+ case X86::JNE_4:
+ case X86::JL_4:
+ case X86::JLE_4:
+ case X86::JG_4:
+ case X86::JGE_4:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_4:
+ case X86::JBE_4:
+ case X86::JA_4:
+ case X86::JAE_4:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_4:
+ case X86::JNS_4:
+ case X86::JP_4:
+ case X86::JNP_4:
+ case X86::JO_4:
+ case X86::JNO_4:
+ FuseKind = FuseTest;
+ break;
+ }
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64_16r:
+ case X86::INC64_32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64_16r:
+ case X86::DEC64_32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ }
+}
bool X86InstrInfo::
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 260f054..a0d1ba7 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -192,6 +192,19 @@ public:
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const;
+ /// Given an operand within a MachineInstr, insert preceding code to put it
+ /// into the right format for a particular kind of LEA instruction. This may
+ /// involve using an appropriate super-register instead (with an implicit use
+ /// of the original) or creating a new virtual register and inserting COPY
+ /// instructions to get the data into the right class.
+ ///
+ /// Reference parameters are set to indicate how caller should add this
+ /// operand to the LEA instruction.
+ bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP,
+ unsigned &NewSrc, bool &isKill,
+ bool &isUndef, MachineOperand &ImplicitOp) const;
+
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
@@ -262,12 +275,6 @@ public:
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
- virtual
- MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx, uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const;
-
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
/// specified operand(s). If this is possible, the target should perform the
@@ -332,6 +339,9 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const;
+ virtual bool shouldScheduleAdjacent(MachineInstr* First,
+ MachineInstr *Second) const LLVM_OVERRIDE;
+
virtual void getNoopForMachoTarget(MCInst &NopInst) const;
virtual
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index d989ec7..0960a2a 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -142,6 +142,9 @@ def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
+def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -314,6 +317,16 @@ def X86MemVY64Operand : AsmOperandClass {
let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
}
+def X86MemVZ64Operand : AsmOperandClass {
+ let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+ let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+ let Name = "Mem512"; let PredicateMethod = "isMem512";
+}
+
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
@@ -342,6 +355,8 @@ def i128mem : X86MemOperand<"printi128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def i256mem : X86MemOperand<"printi256mem"> {
let ParserMatchClass = X86Mem256AsmOperand; }
+def i512mem : X86MemOperand<"printi512mem"> {
+ let ParserMatchClass = X86Mem512AsmOperand; }
def f32mem : X86MemOperand<"printf32mem"> {
let ParserMatchClass = X86Mem32AsmOperand; }
def f64mem : X86MemOperand<"printf64mem"> {
@@ -352,6 +367,12 @@ def f128mem : X86MemOperand<"printf128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def f256mem : X86MemOperand<"printf256mem">{
let ParserMatchClass = X86Mem256AsmOperand; }
+def f512mem : X86MemOperand<"printf512mem">{
+ let ParserMatchClass = X86Mem512AsmOperand; }
+def v512mem : Operand<iPTR> {
+ let PrintMethod = "printf512mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86Mem512AsmOperand; }
// Gather mem operands
def vx32mem : X86MemOperand<"printi32mem">{
@@ -366,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{
def vy64mem : X86MemOperand<"printi64mem">{
let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
let ParserMatchClass = X86MemVY64Operand; }
+def vy64xmem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY64Operand; }
+def vz32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86MemVZ32Operand; }
+def vz64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86MemVZ64Operand; }
}
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -520,8 +550,7 @@ def i64i8imm : Operand<i64> {
def lea64_32mem : Operand<i32> {
let PrintMethod = "printi32mem";
- let AsmOperandLowerMethod = "lower_lea64_32mem";
- let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm);
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
let ParserMatchClass = X86MemAsmOperand;
}
@@ -543,7 +572,7 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex],
[]>;
// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr",
[add, sub, mul, X86mul_imm, shl, or,
frameindex, X86WrapperRIP],
[]>;
@@ -588,11 +617,19 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
+def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">;
+def HasEMI : Predicate<"Subtarget->hasERI()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
@@ -603,7 +640,12 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasHLE : Predicate<"Subtarget->hasHLE()">;
+def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -626,6 +668,7 @@ def OptForSize : Predicate<"OptForSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
@@ -758,7 +801,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
//
// Nop
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
"nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
@@ -769,8 +812,9 @@ let neverHasSideEffects = 1 in {
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
- "enter\t$len, $lvl", [], IIC_ENTER>;
+ "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+let SchedRW = [WriteALU] in {
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
def LEAVE : I<0xC9, RawFrm,
(outs), (ins), "leave", [], IIC_LEAVE>,
@@ -780,32 +824,33 @@ let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
def LEAVE64 : I<0xC9, RawFrm,
(outs), (ins), "leave", [], IIC_LEAVE>,
Requires<[In64BitMode]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
-let mayLoad = 1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
IIC_POP_REG16>, OpSize;
def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>;
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
IIC_POP_REG>, OpSize;
-def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
IIC_POP_MEM>, OpSize;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>;
-def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
IIC_POP_MEM>;
def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
Requires<[In32BitMode]>;
-}
+} // mayLoad, SchedRW
-let mayStore = 1 in {
+let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
IIC_PUSH_REG>, OpSize;
def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
@@ -832,29 +877,30 @@ def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
Requires<[In32BitMode]>;
-}
+} // mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
-let mayLoad = 1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP64r : I<0x58, AddRegFrm,
(outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>;
-def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
IIC_POP_MEM>;
-}
-let mayStore = 1 in {
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH64r : I<0x50, AddRegFrm,
(outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
IIC_PUSH_REG>;
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
IIC_PUSH_MEM>;
-}
+} // mayStore, SchedRW
}
-let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
+ SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
"push{q}\t$imm", [], IIC_PUSH_IMM>;
def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
@@ -865,23 +911,24 @@ def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode]>, Sched<[WriteLoad]>;
let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode]>, Sched<[WriteStore]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
- mayLoad=1, neverHasSideEffects=1 in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
+ mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
Requires<[In32BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
- mayStore=1, neverHasSideEffects=1 in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
+ mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
Requires<[In32BitMode]>;
}
-let Constraints = "$src = $dst" in { // GR32 = bswap GR32
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
def BSWAP32r : I<0xC8, AddRegFrm,
(outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
@@ -890,60 +937,63 @@ def BSWAP32r : I<0xC8, AddRegFrm,
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
[(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
- IIC_BSF>, TB, OpSize;
+ IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
- IIC_BSF>, TB, OpSize;
+ IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+ Sched<[WriteShift]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
- IIC_BSF>, TB;
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
- IIC_BSF>, TB;
+ IIC_BSF>, TB, Sched<[WriteShift]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
- IIC_BSF>, TB;
+ IIC_BSF>, TB, Sched<[WriteShiftLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
- TB, OpSize;
+ TB, OpSize, Sched<[WriteShift]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
IIC_BSR>, TB,
- OpSize;
+ OpSize, Sched<[WriteShiftLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
- IIC_BSR>, TB;
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+ Sched<[WriteShift]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
- IIC_BSR>, TB;
+ IIC_BSR>, TB, Sched<[WriteShiftLd]>;
} // Defs = [EFLAGS]
-
+let SchedRW = [WriteMicrocoded] in {
// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
@@ -971,12 +1021,12 @@ def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
-
+} // SchedRW
//===----------------------------------------------------------------------===//
// Move Instructions.
//
-
+let SchedRW = [WriteMove] in {
let neverHasSideEffects = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
@@ -987,6 +1037,7 @@ def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
+
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
@@ -1004,7 +1055,9 @@ def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
}
+} // SchedRW
+let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
@@ -1017,45 +1070,60 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
/// 32-bit offset from the PC. These are only valid in x86-32 mode.
+let SchedRW = [WriteALU] in {
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
- "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
- "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
+}
-// FIXME: These definitions are utterly broken
-// Just leave them commented out for now because they're useless outside
-// of the large code model, and most compilers won't generate the instructions
-// in question.
-/*
-def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
- "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-*/
-
-
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>,
+ Requires<[In64BitMode]>;
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
+ Requires<[In64BitMode]>;
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>,
+ Requires<[In64BitMode]>;
+def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{q}\t{$src, %rax|rax, $src}", []>,
+ Requires<[In64BitMode]>;
+
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>,
+ Requires<[In64BitMode]>;
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
+ Requires<[In64BitMode]>;
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
+ Requires<[In64BitMode]>;
+def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
+ Requires<[In64BitMode]>;
+
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
"mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
@@ -1066,7 +1134,7 @@ def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
@@ -1081,6 +1149,7 @@ def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
[(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
}
+let SchedRW = [WriteStore] in {
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
@@ -1093,6 +1162,7 @@ def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
@@ -1101,34 +1171,37 @@ let isCodeGenOnly = 1 in {
let neverHasSideEffects = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
+ Sched<[WriteMove]>;
let mayStore = 1 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>;
+ IIC_MOV_MEM>, Sched<[WriteStore]>;
let mayLoad = 1, neverHasSideEffects = 1,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
- IIC_MOV_MEM>;
+ IIC_MOV_MEM>, Sched<[WriteLoad]>;
}
// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
let Defs = [EFLAGS], Uses = [AH] in
def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
[(set EFLAGS, (X86sahf AH))], IIC_AHF>;
let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
IIC_AHF>; // AH = flags
-
+} // SchedRW
//===----------------------------------------------------------------------===//
// Bit tests instructions: BT, BTS, BTR, BTC.
let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
@@ -1139,13 +1212,14 @@ def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
// perspective, this is pretty bizarre. Make these instructions disassembly
// only for now.
-let mayLoad = 1, hasSideEffects = 0 in {
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi16 addr:$src1), GR16:$src2),
@@ -1166,6 +1240,7 @@ let mayLoad = 1, hasSideEffects = 0 in {
>, TB;
}
+let SchedRW = [WriteALU] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
@@ -1178,10 +1253,12 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
IIC_BT_RI>, TB;
+} // SchedRW
// Note that these instructions don't need FastBTMem because that
// only applies when the other operand is in a register. When it's
// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
@@ -1194,8 +1271,10 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
OpSize, TB;
@@ -1203,8 +1282,9 @@ def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1214,6 +1294,7 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
+let SchedRW = [WriteALU] in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1221,8 +1302,9 @@ def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1232,6 +1314,7 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
}
+let SchedRW = [WriteALU] in {
def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
OpSize, TB;
@@ -1239,8 +1322,9 @@ def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1250,6 +1334,7 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
+let SchedRW = [WriteALU] in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1257,8 +1342,9 @@ def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1268,6 +1354,7 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
}
+let SchedRW = [WriteALU] in {
def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
OpSize, TB;
@@ -1275,8 +1362,9 @@ def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1286,6 +1374,7 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
+let SchedRW = [WriteALU] in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1293,8 +1382,9 @@ def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1315,7 +1405,7 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
// operand is referenced, the atomicity is ensured.
multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
InstrItinClass itin> {
- let Constraints = "$val = $dst" in {
+ let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
(ins GR8:$val, i8mem:$ptr),
!strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
@@ -1350,6 +1440,7 @@ multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
// Swap between registers.
+let SchedRW = [WriteALU] in {
let Constraints = "$val = $dst" in {
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
"xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
@@ -1363,20 +1454,20 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
// Swap between EAX and other registers.
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
+ "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
Requires<[In32BitMode]>;
// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
Requires<[In64BitMode]>;
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
-
-
+ "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+let SchedRW = [WriteALU] in {
def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
"xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
@@ -1386,8 +1477,9 @@ def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
-let mayLoad = 1, mayStore = 1 in {
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
"xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
@@ -1400,6 +1492,7 @@ def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
}
+let SchedRW = [WriteALU] in {
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
"cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
IIC_CMPXCHG_REG8>, TB;
@@ -1412,7 +1505,9 @@ def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+let SchedRW = [WriteALULd, WriteRMW] in {
let mayLoad = 1, mayStore = 1 in {
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
"cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
@@ -1436,7 +1531,7 @@ let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
TB, Requires<[HasCmpxchg16b]>;
-
+} // SchedRW
// Lock instruction prefix
@@ -1459,17 +1554,21 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
+}
+let SchedRW = [WriteSystem] in {
def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
-
+}
// Flag instructions
+let SchedRW = [WriteALU] in {
def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
@@ -1479,10 +1578,13 @@ def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
// Table lookup instructions
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+ Sched<[WriteLoad]>;
+let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
@@ -1512,7 +1614,9 @@ def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
Requires<[In32BitMode]>;
+} // SchedRW
+let SchedRW = [WriteSystem] in {
// Check Array Index Against Bounds
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
@@ -1528,11 +1632,13 @@ def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
Requires<[In32BitMode]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// MOVBE Instructions
//
let Predicates = [HasMOVBE] in {
+ let SchedRW = [WriteALULd] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
@@ -1545,6 +1651,8 @@ let Predicates = [HasMOVBE] in {
"movbe{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
T8;
+ }
+ let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
@@ -1557,6 +1665,7 @@ let Predicates = [HasMOVBE] in {
"movbe{q}\t{$src, $dst|$dst, $src}",
[(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
T8;
+ }
}
//===----------------------------------------------------------------------===//
@@ -1575,6 +1684,21 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
}
//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+ def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+ "rdseed{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB;
+ def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+ "rdseed{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB;
+ def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+ "rdseed{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
// LZCNT Instruction
//
let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
@@ -1737,6 +1861,7 @@ include "X86InstrXOP.td"
// SSE, MMX and 3DNow! vector support.
include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
include "X86InstrMMX.td"
include "X86Instr3DNow.td"
@@ -1755,115 +1880,120 @@ include "X86InstrCompiler.td"
// Assembler Mnemonic Aliases
//===----------------------------------------------------------------------===//
-def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"cbw", "cbtw">;
-def : MnemonicAlias<"cwde", "cwtl">;
-def : MnemonicAlias<"cwd", "cwtd">;
-def : MnemonicAlias<"cdq", "cltd">;
-def : MnemonicAlias<"cdqe", "cltq">;
-def : MnemonicAlias<"cqo", "cqto">;
+def : MnemonicAlias<"cbw", "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd", "cwtd", "att">;
+def : MnemonicAlias<"cdq", "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo", "cqto", "att">;
// lret maps to lretl, it is not ambiguous with lretq.
-def : MnemonicAlias<"lret", "lretl">;
+def : MnemonicAlias<"lret", "lretl", "att">;
-def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"loopz", "loope">;
-def : MnemonicAlias<"loopnz", "loopne">;
+def : MnemonicAlias<"loopz", "loope", "att">;
+def : MnemonicAlias<"loopnz", "loopne", "att">;
-def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"popf", "popfq">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"popfd", "popfl">;
+def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
// all modes. However: "push (addr)" and "push $42" should default to
// pushl/pushq depending on the current mode. Similar for "pop %bx"
-def : MnemonicAlias<"push", "pushl">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"push", "pushq">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"pushfd", "pushfl">;
+def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+
+def : MnemonicAlias<"popad", "popa", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushad", "pusha", "intel">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"repe", "rep">;
-def : MnemonicAlias<"repz", "rep">;
-def : MnemonicAlias<"repnz", "repne">;
+def : MnemonicAlias<"repe", "rep", "att">;
+def : MnemonicAlias<"repz", "rep", "att">;
+def : MnemonicAlias<"repnz", "repne", "att">;
-def : MnemonicAlias<"retl", "ret">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"retq", "ret">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"retl", "ret", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retq", "ret", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"salb", "shlb">;
-def : MnemonicAlias<"salw", "shlw">;
-def : MnemonicAlias<"sall", "shll">;
-def : MnemonicAlias<"salq", "shlq">;
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
-def : MnemonicAlias<"smovb", "movsb">;
-def : MnemonicAlias<"smovw", "movsw">;
-def : MnemonicAlias<"smovl", "movsl">;
-def : MnemonicAlias<"smovq", "movsq">;
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
-def : MnemonicAlias<"ud2a", "ud2">;
-def : MnemonicAlias<"verrw", "verr">;
+def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
// System instruction aliases.
-def : MnemonicAlias<"iret", "iretl">;
-def : MnemonicAlias<"sysret", "sysretl">;
-def : MnemonicAlias<"sysexit", "sysexitl">;
+def : MnemonicAlias<"iret", "iretl", "att">;
+def : MnemonicAlias<"sysret", "sysretl", "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
-def : MnemonicAlias<"lgdtl", "lgdt">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"lgdtq", "lgdt">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"lidtl", "lidt">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"lidtq", "lidt">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"sgdtl", "sgdt">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"sgdtq", "sgdt">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"sidtl", "sidt">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"sidtq", "sidt">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lgdtl", "lgdt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdtq", "lgdt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidtl", "lidt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidtq", "lidt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdtl", "sgdt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdtq", "sgdt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidtl", "sidt", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidtq", "sidt", "att">, Requires<[In64BitMode]>;
// Floating point stack aliases.
-def : MnemonicAlias<"fcmovz", "fcmove">;
-def : MnemonicAlias<"fcmova", "fcmovnbe">;
-def : MnemonicAlias<"fcmovnae", "fcmovb">;
-def : MnemonicAlias<"fcmovna", "fcmovbe">;
-def : MnemonicAlias<"fcmovae", "fcmovnb">;
-def : MnemonicAlias<"fcomip", "fcompi">;
-def : MnemonicAlias<"fildq", "fildll">;
-def : MnemonicAlias<"fistpq", "fistpll">;
-def : MnemonicAlias<"fisttpq", "fisttpll">;
-def : MnemonicAlias<"fldcww", "fldcw">;
-def : MnemonicAlias<"fnstcww", "fnstcw">;
-def : MnemonicAlias<"fnstsww", "fnstsw">;
-def : MnemonicAlias<"fucomip", "fucompi">;
-def : MnemonicAlias<"fwait", "wait">;
-
-
-class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond>
+def : MnemonicAlias<"fcmovz", "fcmove", "att">;
+def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
+def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
+def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
+def : MnemonicAlias<"fcomip", "fcompi", "att">;
+def : MnemonicAlias<"fildq", "fildll", "att">;
+def : MnemonicAlias<"fistpq", "fistpll", "att">;
+def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
+def : MnemonicAlias<"fldcww", "fldcw", "att">;
+def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
+def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
+def : MnemonicAlias<"fucomip", "fucompi", "att">;
+def : MnemonicAlias<"fwait", "wait", "att">;
+
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+ string VariantName>
: MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
- !strconcat(Prefix, NewCond, Suffix)>;
+ !strconcat(Prefix, NewCond, Suffix), VariantName>;
/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
/// example "setz" -> "sete".
-multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> {
- def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb
- def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete
- def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe
- def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae
- def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae
- def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle
- def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge
- def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne
- def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp
- def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp
-
- def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb
- def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta
- def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl
- def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+ string V = ""> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
}
// Aliases for set<CC>
@@ -1871,9 +2001,11 @@ defm : IntegerCondCodeMnemonicAlias<"set", "">;
// Aliases for j<CC>
defm : IntegerCondCodeMnemonicAlias<"j", "">;
// Aliases for cmov<CC>{w,l,q}
-defm : IntegerCondCodeMnemonicAlias<"cmov", "w">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "l">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "q">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
//===----------------------------------------------------------------------===//
@@ -1885,75 +2017,83 @@ def : InstAlias<"aad", (AAD8i8 10)>;
def : InstAlias<"aam", (AAM8i8 10)>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
-def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
// clr aliases.
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>;
-def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
-def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
-def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// div and idiv aliases for explicit A register.
-def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
-def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
// Various unary fpstack operations default to operating on on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
-def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>;
-def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>;
-def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>;
-def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>;
-def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>;
-def : InstAlias<"fxch", (XCH_F ST1)>;
-def : InstAlias<"fcom", (COM_FST0r ST1)>;
-def : InstAlias<"fcomp", (COMP_FST0r ST1)>;
-def : InstAlias<"fcomi", (COM_FIr ST1)>;
-def : InstAlias<"fcompi", (COM_FIPr ST1)>;
-def : InstAlias<"fucom", (UCOM_Fr ST1)>;
-def : InstAlias<"fucomp", (UCOM_FPr ST1)>;
-def : InstAlias<"fucomi", (UCOM_FIr ST1)>;
-def : InstAlias<"fucompi", (UCOM_FIPr ST1)>;
+def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
+def : InstAlias<"fxch", (XCH_F ST1), 0>;
+def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
+def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
+def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
+def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
// gas.
multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
- def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
(Inst RST:$op), EmitAlias>;
- def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
(Inst ST0), EmitAlias>;
}
defm : FpUnaryAlias<"fadd", ADD_FST0r>;
defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
defm : FpUnaryAlias<"fsub", SUB_FST0r>;
-defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
-defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
defm : FpUnaryAlias<"fmul", MUL_FST0r>;
defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
-defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
-defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
defm : FpUnaryAlias<"fcompi", COM_FIPr>;
@@ -1963,16 +2103,16 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
// solely because gas supports it.
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
-def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
-def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
-def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
-def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
-def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
-def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
@@ -1991,12 +2131,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
// inb %dx -> inb %al, %dx
-def : InstAlias<"inb %dx", (IN8rr)>;
-def : InstAlias<"inw %dx", (IN16rr)>;
-def : InstAlias<"inl %dx", (IN32rr)>;
-def : InstAlias<"inb $port", (IN8ri i8imm:$port)>;
-def : InstAlias<"inw $port", (IN16ri i8imm:$port)>;
-def : InstAlias<"inl $port", (IN32ri i8imm:$port)>;
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
@@ -2024,7 +2164,7 @@ def : InstAlias<"movq $src, $dst",
// movsd with no operands (as opposed to the SSE scalar move of a double) is an
// alias for movsl. (as in rep; movsd)
-def : InstAlias<"movsd", (MOVSD)>;
+def : InstAlias<"movsd", (MOVSD), 0>;
// movsx aliases
def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
@@ -2045,12 +2185,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
// Note: No GR32->GR64 movzx form.
// outb %dx -> outb %al, %dx
-def : InstAlias<"outb %dx", (OUT8rr)>;
-def : InstAlias<"outw %dx", (OUT16rr)>;
-def : InstAlias<"outl %dx", (OUT32rr)>;
-def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>;
-def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>;
-def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
@@ -2058,19 +2198,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
// shld/shrd op,op -> shld op, op, CL
-def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
-def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
-
-def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
-def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
/* FIXME: This is disabled because the asm matcher is currently incapable of
* matching a fixed immediate like $1.
@@ -2101,19 +2241,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
FIXME */
// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>;
// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>;
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
-def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
-def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 127af6f..cb12956 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -20,6 +20,7 @@
// MMX Multiclasses
//===----------------------------------------------------------------------===//
+let Sched = WriteVecALU in {
def MMX_INTALU_ITINS : OpndItins<
IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
>;
@@ -35,11 +36,14 @@ def MMX_PHADDSUBW : OpndItins<
def MMX_PHADDSUBD : OpndItins<
IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
>;
+}
+let Sched = WriteVecIMul in
def MMX_PMUL_ITINS : OpndItins<
IIC_MMX_PMUL, IIC_MMX_PMUL
>;
+let Sched = WriteVecALU in {
def MMX_PSADBW_ITINS : OpndItins<
IIC_MMX_PSADBW, IIC_MMX_PSADBW
>;
@@ -47,11 +51,13 @@ def MMX_PSADBW_ITINS : OpndItins<
def MMX_MISC_FUNC_ITINS : OpndItins<
IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
>;
+}
def MMX_SHIFT_ITINS : ShiftOpndItins<
IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
>;
+let Sched = WriteShuffle in {
def MMX_UNPCK_H_ITINS : OpndItins<
IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
>;
@@ -67,7 +73,9 @@ def MMX_PCK_ITINS : OpndItins<
def MMX_PSHUF_ITINS : OpndItins<
IIC_MMX_PSHUF, IIC_MMX_PSHUF
>;
+} // Sched
+let Sched = WriteCvtF2I in {
def MMX_CVT_PD_ITINS : OpndItins<
IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
>;
@@ -75,6 +83,7 @@ def MMX_CVT_PD_ITINS : OpndItins<
def MMX_CVT_PS_ITINS : OpndItins<
IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
>;
+}
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
@@ -84,7 +93,8 @@ let Constraints = "$src1 = $dst" in {
def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> {
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]> {
let isCommutable = Commutable;
}
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
@@ -92,7 +102,7 @@ let Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
(bitconvert (load_mmx addr:$src2))))],
- itins.rm>;
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
@@ -101,17 +111,19 @@ let Constraints = "$src1 = $dst" in {
def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>;
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[WriteVecShift]>;
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
(bitconvert (load_mmx addr:$src2))))],
- itins.rm>;
+ itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>;
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>,
+ Sched<[WriteVecShift]>;
}
}
@@ -120,13 +132,14 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
Intrinsic IntId64, OpndItins itins> {
def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
(IntId64 (bitconvert (memopmmx addr:$src))))],
- itins.rm>;
+ itins.rm>, Sched<[itins.Sched.Folded]>;
}
/// Binary MMX instructions requiring SSSE3.
@@ -137,13 +150,15 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (memopmmx addr:$src2))))], itins.rm>;
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -164,21 +179,24 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins, Domain d> {
def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>;
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+ Sched<[itins.Sched]>;
def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>;
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+ Sched<[itins.Sched.Folded]>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, Domain d> {
- def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
- asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- IIC_DEFAULT, d>;
- def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
- (ins DstRC:$src1, x86memop:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- IIC_DEFAULT, d>;
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins DstRC:$src1, SrcRC:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ NoItinerary, d>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ NoItinerary, d>;
}
//===----------------------------------------------------------------------===//
@@ -197,16 +215,17 @@ def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (scalar_to_vector GR32:$src)))],
- IIC_MMX_MOV_MM_RM>;
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
let canFoldAsLoad = 1 in
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (scalar_to_vector (loadi32 addr:$src))))],
- IIC_MMX_MOV_MM_RM>;
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>;
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+ Sched<[WriteStore]>;
// Low word of MMX to GPR.
def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
@@ -214,16 +233,18 @@ def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst,
- (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>;
+ (MMX_X86movd2w (x86mmx VR64:$src)))],
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
let neverHasSideEffects = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOV_MM_RM>;
+ [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
// These are 64 bit moves, but since the OS X assembler doesn't
// recognize a register-register movq, we write them as
// movd.
+let SchedRW = [WriteMove] in {
def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
@@ -237,6 +258,9 @@ let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_MMX_MOVQ_RR>;
+} // SchedRW
+
+let SchedRW = [WriteLoad] in {
let canFoldAsLoad = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -246,7 +270,9 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (x86mmx VR64:$src), addr:$dst)],
IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteMove] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
@@ -271,11 +297,12 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[], IIC_MMX_MOVQ_RR>;
+} // SchedRW
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
[(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
- IIC_MMX_MOVQ_RM>;
+ IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
let AddedComplexity = 15 in
// movd to MMX register zero-extends
@@ -283,7 +310,7 @@ def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
- IIC_MMX_MOV_MM_RM>;
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
let AddedComplexity = 20 in
def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
(ins i32mem:$src),
@@ -291,7 +318,7 @@ def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
[(set VR64:$dst,
(x86mmx (X86vzmovl (x86mmx
(scalar_to_vector (loadi32 addr:$src))))))],
- IIC_MMX_MOV_MM_RM>;
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
@@ -331,21 +358,21 @@ defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
MMX_INTALU_ITINS>;
defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
- MMX_INTALUQ_ITINS, 1>;
+ MMX_INTALUQ_ITINS>;
defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
MMX_PHADDSUBW>;
@@ -491,14 +518,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
- IIC_MMX_PSHUF>;
+ IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
imm:$src2))],
- IIC_MMX_PSHUF>;
+ IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
@@ -532,7 +559,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
(iPTR imm:$src2)))],
- IIC_MMX_PEXTR>;
+ IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
let Constraints = "$src1 = $dst" in {
def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
(outs VR64:$dst),
@@ -540,7 +567,7 @@ let Constraints = "$src1 = $dst" in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
GR32:$src2, (iPTR imm:$src3)))],
- IIC_MMX_PINSRW>;
+ IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
@@ -549,7 +576,7 @@ let Constraints = "$src1 = $dst" in {
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
(i32 (anyext (loadi16 addr:$src2))),
(iPTR imm:$src3)))],
- IIC_MMX_PINSRW>;
+ IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
// Mask creation
@@ -570,6 +597,7 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
(x86mmx (MMX_MOVQ64rm addr:$src))>;
// Misc.
+let SchedRW = [WriteShuffle] in {
let Uses = [EDI] in
def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
@@ -580,6 +608,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
[(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
IIC_MMX_MASKMOV>;
+}
// 64-bit bit convert.
let Predicates = [HasSSE2] in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 105963f..a86006a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -35,6 +35,7 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
// scalar
+let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;
@@ -42,6 +43,7 @@ def SSE_ALU_F32S : OpndItins<
def SSE_ALU_F64S : OpndItins<
IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
+}
def SSE_ALU_ITINS_S : SizeItins<
SSE_ALU_F32S, SSE_ALU_F64S
@@ -76,6 +78,7 @@ def SSE_DIV_ITINS_S : SizeItins<
>;
// parallel
+let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;
@@ -83,6 +86,7 @@ def SSE_ALU_F32P : OpndItins<
def SSE_ALU_F64P : OpndItins<
IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
+}
def SSE_ALU_ITINS_P : SizeItins<
SSE_ALU_F32P, SSE_ALU_F64P
@@ -184,14 +188,16 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))], itins.rr>;
+ RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, mem_cpat:$src2))], itins.rm>;
+ RC:$src1, mem_cpat:$src2))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
@@ -226,13 +232,13 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rr, IIC_DEFAULT, d>,
+ pat_rr, NoItinerary, d>,
Sched<[WriteVecLogic]>;
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rm, IIC_DEFAULT, d>,
+ pat_rm, NoItinerary, d>,
Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
@@ -364,7 +370,7 @@ let Predicates = [HasAVX] in {
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1 in {
+ isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
@@ -381,7 +387,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1 in {
+ isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
@@ -398,7 +404,7 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasAVX] in {
+ isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8f32 immAllZerosV))]>;
}
@@ -436,7 +442,7 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1 in {
+ isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX2] in
@@ -470,7 +476,7 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>;
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
@@ -848,7 +854,7 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
} // SchedRW
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
@@ -924,7 +930,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
} // SchedRW
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
@@ -1070,7 +1076,7 @@ let Predicates = [UseSSE1] in {
// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
@@ -1087,7 +1093,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
@@ -1436,11 +1442,13 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
let neverHasSideEffects = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2F]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // neverHasSideEffects = 1
}
@@ -1740,13 +1748,15 @@ let neverHasSideEffects = 1 in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG;
+ IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR64:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
- XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
+ XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
@@ -1755,26 +1765,28 @@ def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround FR64:$src))],
- IIC_SSE_CVT_Scalar_RR>;
+ IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
- Requires<[UseSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
@@ -1782,13 +1794,15 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
// Convert scalar single to scalar double
@@ -1798,13 +1812,15 @@ def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RR>,
- XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
+ XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR32:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
- XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
+ XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
def : Pat<(f64 (fextend FR32:$src)),
@@ -1823,12 +1839,12 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[UseSSE2]>;
+ Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (extloadf32 addr:$src))],
IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[UseSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1845,57 +1861,61 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>;
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>;
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
// Convert Packed Double FP to Packed DW Integers
@@ -1906,7 +1926,7 @@ let Predicates = [HasAVX] in {
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
- VEX;
+ VEX, Sched<[WriteCvtF2I]>;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -1914,18 +1934,20 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX;
+ (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L;
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
- VEX, VEX_L;
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
}
@@ -1934,11 +1956,11 @@ def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
@@ -1946,32 +1968,33 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
(memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+ Sched<[WriteCvtF2ILd]>;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>;
+ IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>;
+ IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
@@ -2021,7 +2044,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -2034,19 +2057,19 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
(memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -2060,12 +2083,13 @@ let Predicates = [HasAVX] in {
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
(memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
+ IIC_SSE_CVT_PD_RM>,
+ Sched<[WriteCvtF2ILd]>;
// Convert packed single to packed double
let Predicates = [HasAVX] in {
@@ -2073,32 +2097,32 @@ let Predicates = [HasAVX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX;
+ IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX;
+ IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB;
+ IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB;
+ IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>;
}
// Convert Packed DW Integers to Packed Double FP
@@ -2106,30 +2130,33 @@ let Predicates = [HasAVX] in {
let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX;
+ []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX;
+ (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+ Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtdq2_pd_256
- (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L;
+ (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L;
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+ Sched<[WriteCvtI2F]>;
}
let neverHasSideEffects = 1, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>;
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
- IIC_SSE_CVT_PD_RM>;
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
// AVX 256-bit register conversion intrinsics
let Predicates = [HasAVX] in {
@@ -2146,7 +2173,7 @@ let Predicates = [HasAVX] in {
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2155,31 +2182,31 @@ def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
+ IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
// AVX 256-bit register conversion intrinsics
@@ -2244,11 +2271,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
let neverHasSideEffects = 1 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RR>;
+ IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
- IIC_SSE_ALU_F32S_RM>;
+ IIC_SSE_ALU_F32S_RM>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -2314,65 +2342,62 @@ let Constraints = "$src1 = $dst" in {
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr, Domain d> {
- def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ PatFrag ld_frag, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- IIC_SSE_COMIS_RR, d>,
+ IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
- def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))],
- IIC_SSE_COMIS_RM, d>,
+ IIC_SSE_COMIS_RM>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
+ "ucomiss">, TB, VEX, VEX_LIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
- VEX_LIG;
+ "ucomisd">, TB, OpSize, VEX, VEX_LIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB, VEX,
- VEX_LIG;
+ "comiss">, TB, VEX, VEX_LIG;
defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize, VEX,
- VEX_LIG;
+ "comisd">, TB, OpSize, VEX, VEX_LIG;
}
defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss", SSEPackedSingle>, TB, VEX;
+ load, "ucomiss">, TB, VEX;
defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
+ load, "ucomisd">, TB, OpSize, VEX;
defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss", SSEPackedSingle>, TB, VEX;
+ load, "comiss">, TB, VEX;
defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
+ load, "comisd">, TB, OpSize, VEX;
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, TB;
+ "ucomiss">, TB;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, TB, OpSize;
+ "ucomisd">, TB, OpSize;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB;
+ "comiss">, TB;
defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize;
+ "comisd">, TB, OpSize;
}
defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss", SSEPackedSingle>, TB;
+ load, "ucomiss">, TB;
defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd", SSEPackedDouble>, TB, OpSize;
+ load, "ucomisd">, TB, OpSize;
defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB;
+ "comiss">, TB;
defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize;
+ "comisd">, TB, OpSize;
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
@@ -2394,10 +2419,11 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
let neverHasSideEffects = 1 in {
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_CMPP_RR, d>;
+ asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
def rmi_alt : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_CMPP_RM, d>;
+ asm_alt, [], IIC_SSE_CMPP_RM, d>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
}
}
@@ -2694,18 +2720,18 @@ let Predicates = [HasAVX] in {
// Assembler Only
def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
"movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX;
+ SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
"movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
SSEPackedDouble>, TB,
- OpSize, VEX;
+ OpSize, VEX, Sched<[WriteVecLogic]>;
def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX, VEX_L;
+ SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
SSEPackedDouble>, TB,
- OpSize, VEX, VEX_L;
+ OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2817,8 +2843,8 @@ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
SSE_BIT_ITINS_P>;
-let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
- defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+let isCommutable = 0 in
+ defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
SSE_BIT_ITINS_P>;
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
@@ -3020,12 +3046,20 @@ let isCodeGenOnly = 1 in {
/// And, we have a special variant form for a full-vector intrinsic form.
let Sched = WriteFSqrt in {
-def SSE_SQRTP : OpndItins<
- IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+def SSE_SQRTPS : OpndItins<
+ IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+ IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+ IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;
-def SSE_SQRTS : OpndItins<
- IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+def SSE_SQRTSD : OpndItins<
+ IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}
@@ -3290,18 +3324,18 @@ let Predicates = [HasAVX] in {
// Square root.
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
- SSE_SQRTS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+ SSE_SQRTSS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
- SSE_SQRTS>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
+ SSE_SQRTSD>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
- int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+ int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
@@ -3458,7 +3492,7 @@ def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
//===----------------------------------------------------------------------===//
// Prefetch intrinsic.
-let Predicates = [HasSSE1] in {
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
"prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
IIC_SSE_PREFETCH>, TB;
@@ -3473,6 +3507,8 @@ def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
IIC_SSE_PREFETCH>, TB;
}
+// FIXME: How should these memory instructions be modeled?
+let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
@@ -3492,6 +3528,7 @@ def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
"mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
TB, Requires<[HasSSE2]>;
+} // SchedRW
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
@@ -3503,17 +3540,17 @@ def : Pat<(X86MFence), (MFENCE)>;
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, VEX;
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, VEX;
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>;
+ IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>;
+ IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -4337,43 +4374,43 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
-def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
VEX, Sched<[WriteMove]>;
-def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
Sched<[WriteMove]>;
-def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
@@ -4381,22 +4418,22 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
-def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
@@ -4404,23 +4441,23 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
-def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
Sched<[WriteMove]>;
-def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs),
+def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
Sched<[WriteMove]>;
-def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
@@ -4430,14 +4467,14 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
// Move Packed Doubleword Int first element to Doubleword Int
//
let SchedRW = [WriteMove] in {
-def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "vmov{d|q}\t{$src, $dst|$dst, $src}",
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
- TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
+ VEX;
-def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
@@ -4452,11 +4489,11 @@ def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
VEX, Sched<[WriteLoad]>;
-def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
@@ -4465,11 +4502,11 @@ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4477,33 +4514,34 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
-def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, VEX;
-def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX;
-def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>;
-def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>;
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
//===---------------------------------------------------------------------===//
// Patterns and instructions to describe movd/movq to XMM register zero-extends
//
+let SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
-def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def VMOVZDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector GR32:$src)))))],
IIC_SSE_MOVDQ>, VEX;
-def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
@@ -4511,32 +4549,33 @@ def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
VEX, VEX_W;
}
let AddedComplexity = 15 in {
-def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def MOVZDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector GR32:$src)))))],
IIC_SSE_MOVDQ>;
-def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
IIC_SSE_MOVDQ>;
}
+} // SchedRW
-let AddedComplexity = 20 in {
-def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+let AddedComplexity = 20, SchedRW = [WriteLoad] in {
+def VMOVZDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
(loadi32 addr:$src))))))],
IIC_SSE_MOVDQ>, VEX;
-def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
(loadi32 addr:$src))))))],
IIC_SSE_MOVDQ>;
-}
+} // AddedComplexity, SchedRW
let Predicates = [HasAVX] in {
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
@@ -4585,6 +4624,8 @@ def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//
+
+let SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4596,31 +4637,35 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (scalar_to_vector (loadi64 addr:$src))))],
IIC_SSE_MOVDQ>, XS,
Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // SchedRW
//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
-def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+let SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, VEX;
-def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>;
+} // SchedRW
//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
//
-def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
-def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
+ Sched<[WriteStore]>;
+def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
- IIC_SSE_MOVDQ>;
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
let AddedComplexity = 20 in
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
@@ -4629,7 +4674,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[HasAVX]>;
+ XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
@@ -4638,7 +4683,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
- XS, Requires<[UseSSE2]>;
+ XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
let Predicates = [HasAVX], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
@@ -4668,6 +4713,7 @@ def : Pat<(v4i64 (X86vzload addr:$src)),
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
//
+let SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -4680,7 +4726,9 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
XS, Requires<[UseSSE2]>;
+} // SchedRW
+let SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -4696,6 +4744,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
IIC_SSE_MOVDQ>,
XS, Requires<[UseSSE2]>;
}
+} // SchedRW
let AddedComplexity = 20 in {
let Predicates = [HasAVX] in {
@@ -4713,26 +4762,30 @@ let AddedComplexity = 20 in {
}
// Instructions to match in the assembler
-def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+let SchedRW = [WriteMove] in {
+def VMOVQs64rr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
-def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
-def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
+} // SchedRW
// Instructions for the disassembler
// xr = XMM register
// xm = mem64
+let SchedRW = [WriteMove] in {
let Predicates = [HasAVX] in
def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
+} // SchedRW
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
@@ -4743,11 +4796,11 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (vt (OpNode RC:$src)))],
- IIC_SSE_MOV_LH>;
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (OpNode (mem_frag addr:$src)))],
- IIC_SSE_MOV_LH>;
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
}
let Predicates = [HasAVX] in {
@@ -4803,25 +4856,27 @@ multiclass sse3_replicate_dfp<string OpcodeStr> {
let neverHasSideEffects = 1 in
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [], IIC_SSE_MOV_LH>;
+ [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(v2f64 (X86Movddup
(scalar_to_vector (loadf64 addr:$src)))))],
- IIC_SSE_MOV_LH>;
+ IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
}
// FIXME: Merge with above classe when there're patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>;
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+ Sched<[WriteShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(v4f64 (X86Movddup
- (scalar_to_vector (loadf64 addr:$src)))))]>;
+ (scalar_to_vector (loadf64 addr:$src)))))]>,
+ Sched<[WriteShuffleLd]>;
}
let Predicates = [HasAVX] in {
@@ -4869,6 +4924,7 @@ let Predicates = [UseSSE3] in {
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
+let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
@@ -4882,6 +4938,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
IIC_SSE_LDDQU>;
+}
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
@@ -4895,13 +4952,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>;
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : I<0xD0, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>;
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
@@ -4938,14 +4997,15 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
- IIC_SSE_HADDSUB_RM>;
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
@@ -4953,14 +5013,15 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+ Sched<[WriteFAdd]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
- IIC_SSE_HADDSUB_RM>;
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
@@ -5009,7 +5070,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
- OpSize;
+ OpSize, Sched<[WriteVecALU]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
@@ -5017,7 +5078,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
[(set VR128:$dst,
(IntId128
(bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
- OpSize;
+ OpSize, Sched<[WriteVecALULd]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
@@ -5027,16 +5088,27 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (IntId256 VR256:$src))]>,
- OpSize;
+ OpSize, Sched<[WriteVecALU]>;
def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(IntId256
- (bitconvert (memopv4i64 addr:$src))))]>, OpSize;
+ (bitconvert (memopv4i64 addr:$src))))]>, OpSize,
+ Sched<[WriteVecALULd]>;
}
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128:$src))>;
+def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>;
+def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>;
+def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>;
+
let Predicates = [HasAVX] in {
defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb",
int_x86_ssse3_pabs_b_128>, VEX;
@@ -5044,6 +5116,19 @@ let Predicates = [HasAVX] in {
int_x86_ssse3_pabs_w_128>, VEX;
defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd",
int_x86_ssse3_pabs_d_128>, VEX;
+
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (VPABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (VPABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (VPABSDrr128 VR128:$src)>;
}
let Predicates = [HasAVX2] in {
@@ -5053,6 +5138,19 @@ let Predicates = [HasAVX2] in {
int_x86_avx2_pabs_w>, VEX, VEX_L;
defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
int_x86_avx2_pabs_d>, VEX, VEX_L;
+
+ def : Pat<(xor
+ (bc_v4i64 (v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+ (VPABSBrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+ (VPABSWrr256 VR256:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+ (VPABSDrr256 VR256:$src)>;
}
defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
@@ -5062,10 +5160,26 @@ defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
int_x86_ssse3_pabs_d_128>;
+let Predicates = [HasSSSE3] in {
+ def : Pat<(xor
+ (bc_v2i64 (v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+ (PABSBrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+ (PABSWrr128 VR128:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+ (PABSDrr128 VR128:$src)>;
+}
+
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
+let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
@@ -5075,12 +5189,16 @@ def SSE_PHADDSUBSW : OpndItins<
def SSE_PHADDSUBW : OpndItins<
IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
+}
+let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
+let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
+let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
@@ -5097,7 +5215,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- OpSize;
+ OpSize, Sched<[itins.Sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -5105,7 +5223,8 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize;
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
@@ -5119,7 +5238,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
- OpSize;
+ OpSize, Sched<[itins.Sched]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
@@ -5127,7 +5246,8 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+ (bitconvert (memopv2i64 addr:$src2))))]>, OpSize,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
@@ -5269,7 +5389,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNR>, OpSize;
+ [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
let mayLoad = 1 in
def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5277,7 +5397,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNR>, OpSize;
+ [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
@@ -5287,13 +5407,13 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, OpSize;
+ []>, OpSize, Sched<[WriteShuffle]>;
let mayLoad = 1 in
def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2, i8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, OpSize;
+ []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
@@ -5341,6 +5461,7 @@ def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
// SSSE3 - Thread synchronization
//===---------------------------------------------------------------------===//
+let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
[(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
@@ -5354,13 +5475,14 @@ let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
[(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
TB, Requires<[HasSSE3]>;
+} // SchedRW
-def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
-def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
-def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
Requires<[In32BitMode]>;
-def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
@@ -6017,11 +6139,11 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
}
let ExeDomain = SSEPackedSingle in {
- let Predicates = [HasAVX] in {
+ let Predicates = [UseAVX] in {
defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
(ins VR128:$src1, i32i8imm:$src2),
- "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, OpSize, VEX;
}
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
@@ -6773,7 +6895,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -6782,7 +6904,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
RC:$src3))],
- IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
let Predicates = [HasAVX] in {
@@ -6894,17 +7016,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
int_x86_sse41_pblendvb>;
// Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
let Predicates = [UseSSE41] in {
@@ -7144,62 +7266,62 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
let Constraints = "$src1 = $dst" in {
def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i8mem:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_8 GR32:$src1,
(load addr:$src2)))]>;
def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR8:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i16mem:$src2),
- "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ "crc32{w}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_16 GR32:$src1,
(load addr:$src2)))]>,
OpSize;
def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR16:$src2),
- "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ "crc32{w}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
OpSize;
def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
- "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ "crc32{l}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_32 GR32:$src1,
(load addr:$src2)))]>;
def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ "crc32{l}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i8mem:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_8 GR64:$src1,
(load addr:$src2)))]>,
REX_W;
def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR8:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
REX_W;
def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
- "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ "crc32{q}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_64 GR64:$src1,
(load addr:$src2)))]>,
REX_W;
def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ "crc32{q}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
REX_W;
@@ -7464,62 +7586,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX] in {
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
(bc_v4i32 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
(bc_v16i8 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
(bc_v8i16 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -7539,59 +7661,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
// AVX1 patterns
let Predicates = [HasAVX] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2f64 (VEXTRACTF128rr
(v4f64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTF128rr
(v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTF128rr
(v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTF128rr
(v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTF128rr
(v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
@@ -8069,42 +8191,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX2] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
(bc_v4i32 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
(bc_v16i8 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
(bc_v8i16 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -8123,39 +8245,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
VEX, VEX_L;
let Predicates = [HasAVX2] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTI128rr
(v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTI128rr
(v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTI128rr
(v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTI128rr
(v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
@@ -8250,7 +8372,9 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
[]>, VEX_4VOp3, VEX_L;
}
-let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+let mayLoad = 1, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 757dcd0..0191c01 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+ "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+ "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 1185941..1937770 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -15,26 +15,26 @@
let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
- "shl{b}\t{%cl, $dst|$dst, CL}",
+ "shl{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t{%cl, $dst|$dst, CL}",
+ "shl{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t{%cl, $dst|$dst, CL}",
+ "shl{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t{%cl, $dst|$dst, CL}",
+ "shl{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
} // Uses = [CL]
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
-
+
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
@@ -43,7 +43,7 @@ def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
-def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
@@ -62,24 +62,25 @@ def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t$dst", [], IIC_SR>;
} // hasSideEffects = 0
} // isConvertibleToThreeAddress = 1
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
// using CL?
let Uses = [CL] in {
def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
- "shl{b}\t{%cl, $dst|$dst, CL}",
+ "shl{b}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
- "shl{w}\t{%cl, $dst|$dst, CL}",
+ "shl{w}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
OpSize;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
- "shl{l}\t{%cl, $dst|$dst, CL}",
+ "shl{l}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
- "shl{q}\t{%cl, $dst|$dst, CL}",
+ "shl{q}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -118,20 +119,21 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
[(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
IIC_SR>;
+} // SchedRW
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
- "shr{b}\t{%cl, $dst|$dst, CL}",
+ "shr{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
- "shr{w}\t{%cl, $dst|$dst, CL}",
+ "shr{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
- "shr{l}\t{%cl, $dst|$dst, CL}",
+ "shr{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
- "shr{q}\t{%cl, $dst|$dst, CL}",
+ "shr{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
}
@@ -163,22 +165,23 @@ def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t$dst",
[(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
- "shr{b}\t{%cl, $dst|$dst, CL}",
+ "shr{b}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
- "shr{w}\t{%cl, $dst|$dst, CL}",
+ "shr{w}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
OpSize;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
- "shr{l}\t{%cl, $dst|$dst, CL}",
+ "shr{l}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
- "shr{q}\t{%cl, $dst|$dst, CL}",
+ "shr{q}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -216,23 +219,24 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
[(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
IIC_SR>;
+} // SchedRW
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
- "sar{b}\t{%cl, $dst|$dst, CL}",
+ "sar{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (sra GR8:$src1, CL))],
IIC_SR>;
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
- "sar{w}\t{%cl, $dst|$dst, CL}",
+ "sar{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (sra GR16:$src1, CL))],
IIC_SR>, OpSize;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
- "sar{l}\t{%cl, $dst|$dst, CL}",
+ "sar{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (sra GR32:$src1, CL))],
IIC_SR>;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
- "sar{q}\t{%cl, $dst|$dst, CL}",
+ "sar{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (sra GR64:$src1, CL))],
IIC_SR>;
}
@@ -273,24 +277,25 @@ def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t$dst",
[(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
IIC_SR>;
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
- "sar{b}\t{%cl, $dst|$dst, CL}",
+ "sar{b}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
- "sar{w}\t{%cl, $dst|$dst, CL}",
+ "sar{w}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
- "sar{l}\t{%cl, $dst|$dst, CL}",
+ "sar{l}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
- "sar{q}\t{%cl, $dst|$dst, CL}",
+ "sar{q}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -330,20 +335,21 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
[(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
IIC_SR>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Rotate instructions
//===----------------------------------------------------------------------===//
let hasSideEffects = 0 in {
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"rcl{w}\t$dst", [], IIC_SR>, OpSize;
@@ -351,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"rcl{l}\t$dst", [], IIC_SR>;
@@ -359,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
@@ -368,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
@@ -377,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"rcr{w}\t$dst", [], IIC_SR>, OpSize;
@@ -385,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"rcr{l}\t$dst", [], IIC_SR>;
@@ -393,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
@@ -401,10 +407,11 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
} // Constraints = "$src = $dst"
+let SchedRW = [WriteShiftLd, WriteRMW] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
@@ -441,39 +448,40 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
let Uses = [CL] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
}
+} // SchedRW
} // hasSideEffects = 0
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
// FIXME: provide shorter instructions when imm8 == 1
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
- "rol{b}\t{%cl, $dst|$dst, CL}",
+ "rol{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
- "rol{w}\t{%cl, $dst|$dst, CL}",
+ "rol{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
- "rol{l}\t{%cl, $dst|$dst, CL}",
+ "rol{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
- "rol{q}\t{%cl, $dst|$dst, CL}",
+ "rol{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
}
@@ -512,23 +520,24 @@ def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t$dst",
[(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
IIC_SR>;
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
- "rol{b}\t{%cl, $dst|$dst, CL}",
+ "rol{b}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
- "rol{w}\t{%cl, $dst|$dst, CL}",
+ "rol{w}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
- "rol{l}\t{%cl, $dst|$dst, CL}",
+ "rol{l}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
- "rol{q}\t{%cl, $dst|$dst, %cl}",
+ "rol{q}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -568,20 +577,21 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
[(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
IIC_SR>;
+} // SchedRW
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
- "ror{b}\t{%cl, $dst|$dst, CL}",
+ "ror{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
- "ror{w}\t{%cl, $dst|$dst, CL}",
+ "ror{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
- "ror{l}\t{%cl, $dst|$dst, CL}",
+ "ror{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
- "ror{q}\t{%cl, $dst|$dst, CL}",
+ "ror{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
}
@@ -620,23 +630,24 @@ def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
[(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
IIC_SR>;
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
- "ror{b}\t{%cl, $dst|$dst, CL}",
+ "ror{b}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
- "ror{w}\t{%cl, $dst|$dst, CL}",
+ "ror{w}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
- "ror{l}\t{%cl, $dst|$dst, CL}",
+ "ror{l}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
- "ror{q}\t{%cl, $dst|$dst, CL}",
+ "ror{q}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -676,46 +687,47 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
[(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
IIC_SR>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Double shift instructions (generalizations of rotate)
//===----------------------------------------------------------------------===//
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
- "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
TB, OpSize;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
- "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
TB, OpSize;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
IIC_SHD32_REG_CL>, TB;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
IIC_SHD32_REG_CL>, TB;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
IIC_SHD64_REG_CL>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
IIC_SHD64_REG_CL>,
TB;
@@ -765,33 +777,34 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(i8 imm:$src3)))], IIC_SHD64_REG_IM>,
TB;
}
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst", SchedRW
+let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)], IIC_SHD64_MEM_CL>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)], IIC_SHD64_MEM_CL>, TB;
}
@@ -840,6 +853,7 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD64_MEM_IM>,
TB;
+} // SchedRW
} // Defs = [EFLAGS]
@@ -857,12 +871,12 @@ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
let neverHasSideEffects = 1 in {
def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, TAXD, VEX;
+ []>, TAXD, VEX, Sched<[WriteShift]>;
let mayLoad = 1 in
def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, i8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, TAXD, VEX;
+ []>, TAXD, VEX, Sched<[WriteShiftLd]>;
}
}
@@ -870,11 +884,17 @@ multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
let neverHasSideEffects = 1 in {
def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- VEX_4VOp3;
+ VEX_4VOp3, Sched<[WriteShift]>;
let mayLoad = 1 in
def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- VEX_4VOp3;
+ VEX_4VOp3,
+ Sched<[WriteShiftLd,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src1
+ ReadAfterLd]>;
}
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 3caa1b5..2196dc3 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -13,6 +13,7 @@
//
//===----------------------------------------------------------------------===//
+let SchedRW = [WriteSystem] in {
let Defs = [RAX, RDX] in
def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
TB;
@@ -35,6 +36,7 @@ let Uses = [EFLAGS] in
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
[(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
def : Pat<(debugtrap),
(INT3)>;
@@ -43,6 +45,7 @@ def : Pat<(debugtrap),
// FIXME: This doesn't work because InstAlias can't match immediate constants.
//def : InstAlias<"int\t$3", (INT3)>;
+let SchedRW = [WriteSystem] in {
def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
[(int_x86_int imm:$trap)], IIC_INT>;
@@ -65,58 +68,62 @@ def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
Requires<[In64BitMode]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Input/Output Instructions.
//
+let SchedRW = [WriteSystem] in {
let Defs = [AL], Uses = [DX] in
def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
+ "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize;
+ "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
+ "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
- "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
+ "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
+ "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
+ "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
+ "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
+ "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
+ "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
- "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
+ "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
+ "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
+ "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>;
def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Moves to and from debug registers
+let SchedRW = [WriteSystem] in {
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
@@ -126,10 +133,12 @@ def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Moves to and from control registers
+let SchedRW = [WriteSystem] in {
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
@@ -139,6 +148,7 @@ def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Segment override instruction prefixes
@@ -155,6 +165,7 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
// Moves to and from segment registers.
//
+let SchedRW = [WriteMove] in {
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
@@ -182,10 +193,12 @@ def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Segmentation support instructions.
+let SchedRW = [WriteSystem] in {
def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
@@ -235,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
OpSize, Requires<[In32BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
Requires<[In32BitMode]>;
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+ "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
OpSize, Requires<[In32BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+ "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
Requires<[In32BitMode]>;
def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+ "pop{w}\t{%es|es}", [], IIC_POP_SR>,
OpSize, Requires<[In32BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+ "pop{l}\t{%es|es}", [], IIC_POP_SR>,
Requires<[In32BitMode]>;
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
+ "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB;
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
+ "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB;
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
@@ -347,10 +360,12 @@ def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
"verw\t$seg", [], IIC_VERW_MEM>, TB;
def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
"verw\t$seg", [], IIC_VERW_REG>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Descriptor-table support instructions
+let SchedRW = [WriteSystem] in {
def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
"sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
@@ -385,9 +400,11 @@ def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
"lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
"lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
-
+} // SchedRW
+
//===----------------------------------------------------------------------===//
// Specialized register support
+let SchedRW = [WriteSystem] in {
def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
@@ -410,14 +427,18 @@ def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
"lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Cache instructions
+let SchedRW = [WriteSystem] in {
def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// XSAVE instructions
+let SchedRW = [WriteSystem] in {
let Defs = [RDX, RAX], Uses = [RCX] in
def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
@@ -428,16 +449,17 @@ let Uses = [RDX, RAX] in {
def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
"xsave\t$dst", []>, TB;
def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "xsave{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
"xrstor\t$dst", []>, TB;
def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "xrstor{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
"xsaveopt\t$dst", []>, TB;
def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
- "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "xsaveopt{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
}
+} // SchedRW
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
@@ -493,8 +515,15 @@ let Predicates = [HasFSGSBase, In64BitMode] in {
//===----------------------------------------------------------------------===//
// INVPCID Instruction
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In32BitMode]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS], Uses = [EFLAGS] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index a37a8cc..59a6f1e 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -15,6 +15,9 @@
//===----------------------------------------------------------------------===//
// TSX instructions
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
let usesCustomInserter = 1 in
def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
"# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
@@ -27,6 +30,17 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
def XEND : I<0x01, MRM_D5, (outs), (ins),
"xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
[(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 44d8cce..fc86e1e 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -339,6 +339,7 @@ extern "C" {
/// must locate the start of the stub or call site and pass it into the JIT
/// compiler function.
extern "C" {
+LLVM_ATTRIBUTE_USED // Referenced from inline asm.
LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
intptr_t RetAddr) {
intptr_t *RetAddrLoc = &StackPtr[1];
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index a8a9fd8..c7c00b5 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -225,32 +225,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
}
-
-static void lower_subreg32(MCInst *MI, unsigned OpNo) {
- // Convert registers in the addr mode according to subreg32.
- unsigned Reg = MI->getOperand(OpNo).getReg();
- if (Reg != 0)
- MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
-}
-
-static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
- // Convert registers in the addr mode according to subreg64.
- for (unsigned i = 0; i != 4; ++i) {
- if (!MI->getOperand(OpNo+i).isReg()) continue;
-
- unsigned Reg = MI->getOperand(OpNo+i).getReg();
- // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
- if (Reg == 0 || Reg == X86::RIP) continue;
-
- MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
- }
-}
-
-/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
-static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
- OutMI.setOpcode(NewOpc);
- lower_subreg32(&OutMI, 0);
-}
/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R
static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
OutMI.setOpcode(NewOpc);
@@ -280,6 +254,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
Inst.addOperand(Saved);
}
+/// \brief If a movsx instruction has a shorter encoding for the used register
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+ unsigned NewOpcode = 0;
+ unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ if (Op0 == X86::AX && Op1 == X86::AL)
+ NewOpcode = X86::CBW;
+ break;
+ case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
+ if (Op0 == X86::EAX && Op1 == X86::AX)
+ NewOpcode = X86::CWDE;
+ break;
+ case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+ if (Op0 == X86::RAX && Op1 == X86::EAX)
+ NewOpcode = X86::CDQE;
+ break;
+ }
+
+ if (NewOpcode != 0) {
+ Inst = MCInst();
+ Inst.setOpcode(NewOpcode);
+ }
+}
+
/// \brief Simplify things like MOV32rm to MOV32o32a.
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
@@ -376,9 +378,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
// Handle a few special cases to eliminate operand modifiers.
ReSimplify:
switch (OutMI.getOpcode()) {
- case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
- lower_lea64_32mem(&OutMI, 1);
- // FALL THROUGH.
+ case X86::LEA64_32r:
case X86::LEA64r:
case X86::LEA16r:
case X86::LEA32r:
@@ -388,23 +388,10 @@ ReSimplify:
assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
- case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
- case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
- case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
- case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
- case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
- case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
- case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
- case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
- case X86::MOV16r0:
- LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
- LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
- break;
- case X86::MOV64r0:
- LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0
- LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ case X86::MOV32ri64:
+ OutMI.setOpcode(X86::MOV32ri);
break;
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
@@ -598,6 +585,13 @@ ReSimplify:
case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+ // Try to shrink some forms of movsx.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr32:
+ SimplifyMOVSX(OutMI);
+ break;
+
case X86::MORESTACK_RET:
OutMI.setOpcode(X86::RET);
break;
@@ -695,13 +689,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(Mang, *MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
- if (isVerbose() && OutStreamer.hasRawTextSupport()) {
- std::string TmpStr;
- raw_string_ostream OS(TmpStr);
- PrintDebugValueComment(MI, OS);
- OutStreamer.EmitRawText(StringRef(OS.str()));
- }
- return;
+ llvm_unreachable("Should be handled target independently");
// Emit nothing here but a comment if we can.
case X86::Int_MemBarrier:
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 16886e4..0923310 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -54,15 +54,14 @@ static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
-X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
- const TargetInstrInfo &tii)
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
: X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::RIP : X86::EIP),
X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
(tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::RIP : X86::EIP)),
- TM(tm), TII(tii) {
+ TM(tm) {
X86_MC::InitLLVM2SEHRegisterMapping(this);
// Cache some information.
@@ -242,6 +241,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::Intel_OCL_BI: {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_SaveList;
if (HasAVX && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
if (HasAVX && Is64Bit)
@@ -276,8 +280,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX512)
+ return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+ if (Is64Bit && HasAVX512)
+ return CSR_64_Intel_OCL_BI_AVX512_RegMask;
if (IsWin64 && HasAVX)
return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
if (Is64Bit && HasAVX)
@@ -306,19 +315,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
// Set the stack-pointer register and its aliases as reserved.
- Reserved.set(X86::RSP);
- for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
// Set the instruction pointer register and its aliases as reserved.
- Reserved.set(X86::RIP);
- for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
// Set the frame-pointer register and its aliases as reserved if needed.
if (TFI->hasFP(MF)) {
- Reserved.set(X86::RBP);
- for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
}
@@ -331,8 +340,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"Stack realignment in presence of dynamic allocas is not supported with"
"this calling convention.");
- Reserved.set(getBaseRegister());
- for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
+ I.isValid(); ++I)
Reserved.set(*I);
}
@@ -345,14 +354,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::GS);
// Mark the floating point stack registers as reserved.
- Reserved.set(X86::ST0);
- Reserved.set(X86::ST1);
- Reserved.set(X86::ST2);
- Reserved.set(X86::ST3);
- Reserved.set(X86::ST4);
- Reserved.set(X86::ST5);
- Reserved.set(X86::ST6);
- Reserved.set(X86::ST7);
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
// Reserve the registers that only exist in 64-bit mode.
if (!Is64Bit) {
@@ -365,19 +368,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned n = 0; n != 8; ++n) {
// R8, R9, ...
- static const uint16_t GPR64[] = {
- X86::R8, X86::R9, X86::R10, X86::R11,
- X86::R12, X86::R13, X86::R14, X86::R15
- };
- for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
// XMM8, XMM9, ...
- assert(X86::XMM15 == X86::XMM8+7);
for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
}
}
+ if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ for (unsigned n = 16; n != 32; ++n) {
+ for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
return Reserved;
}
@@ -407,10 +411,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+ return false;
+
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- if (!MF.getTarget().Options.RealignStack)
- return false;
// Stack realignment requires a frame pointer. If we already started
// register allocation with frame pointer elimination, it is too late now.
@@ -688,4 +693,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
}
}
}
+
+unsigned get512BitSuperRegister(unsigned Reg) {
+ if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+ return X86::ZMM0 + (Reg - X86::XMM0);
+ if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+ return X86::ZMM0 + (Reg - X86::YMM0);
+ if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+ return Reg;
+ llvm_unreachable("Unexpected SIMD register");
+}
+
}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index b9d7b8c..fb17682 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -27,7 +27,6 @@ namespace llvm {
class X86RegisterInfo : public X86GenRegisterInfo {
public:
X86TargetMachine &TM;
- const TargetInstrInfo &TII;
private:
/// Is64Bit - Is the target 64-bits.
@@ -56,7 +55,7 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+ X86RegisterInfo(X86TargetMachine &tm);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
@@ -138,6 +137,9 @@ public:
// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+//get512BitRegister - X86 utility - returns 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index be6282a..b802728 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -21,11 +21,12 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n>
// Subregister indices.
let Namespace = "X86" in {
- def sub_8bit : SubRegIndex;
- def sub_8bit_hi : SubRegIndex;
- def sub_16bit : SubRegIndex;
- def sub_32bit : SubRegIndex;
- def sub_xmm : SubRegIndex;
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
}
//===----------------------------------------------------------------------===//
@@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
} // CostPerUse
-// YMM Registers, used by AVX instructions
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
let SubRegIndices = [sub_xmm] in {
-def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
-def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>;
-def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>;
-def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>;
-def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>;
-def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>;
-def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>;
-def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>;
-def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>;
-def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>;
-def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
-def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
-def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
-def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
-def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
-def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+ foreach Index = 0-31 in {
+ def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+ foreach Index = 0-31 in {
+ def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
}
+ // Mask Registers, used by AVX-512 instructions.
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+ def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+ def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+ def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+ def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+ def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+ def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
let Aliases = A;
}
@@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512,
+ (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ 256, (sequence "YMM%u", 0, 31)>;
+
+def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>;
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>;
+
+def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>;
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
+
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 0000000..62ba2bc
--- /dev/null
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,132 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, 6 and 7 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+ HWPort5, HWPort6, HWPort7]> {
+ let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [HWPort0156]>;
+def : WriteRes<WriteZero, []>;
+
+defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : HWWriteResPair<WriteShift, HWPort056, 1>;
+defm : HWWriteResPair<WriteJump, HWPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort05, 1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
+defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
+defm : HWWriteResPair<WriteShuffle, HWPort15, 1>;
+
+def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+} // SchedModel
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 0000000..52ead94
--- /dev/null
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,127 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+ // instructions per cycle.
+ // FIXME: Identify instructions that aren't a single fused micro-op.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 168; // Based on the reorder buffer.
+ let LoadLatency = 4;
+ let MispredictPenalty = 16;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+ let BufferSize=54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove, [SBPort015]>;
+def : WriteRes<WriteZero, []>;
+
+defm : SBWriteResPair<WriteALU, SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : SBWriteResPair<WriteShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
+defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
+defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+
+def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+} // SchedModel
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index da0ca7d..ceb2e05 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -42,6 +42,7 @@ multiclass X86SchedWritePair {
// Arithmetic.
defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
defm WriteIDiv : X86SchedWritePair; // Integer division.
def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
@@ -53,6 +54,10 @@ def WriteLoad : SchedWrite;
def WriteStore : SchedWrite;
def WriteMove : SchedWrite;
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
defm WriteJump : X86SchedWritePair;
@@ -63,6 +68,10 @@ defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
@@ -79,9 +88,14 @@ defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// Old microcoded instructions that nobody use.
+def WriteMicrocoded : SchedWrite;
+
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for X86
-def IIC_DEFAULT : InstrItinClass;
def IIC_ALU_MEM : InstrItinClass;
def IIC_ALU_NONMEM : InstrItinClass;
def IIC_LEA : InstrItinClass;
@@ -253,10 +267,14 @@ def IIC_SSE_PINSRW : InstrItinClass;
def IIC_SSE_PABS_RR : InstrItinClass;
def IIC_SSE_PABS_RM : InstrItinClass;
-def IIC_SSE_SQRTP_RR : InstrItinClass;
-def IIC_SSE_SQRTP_RM : InstrItinClass;
-def IIC_SSE_SQRTS_RR : InstrItinClass;
-def IIC_SSE_SQRTS_RM : InstrItinClass;
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
def IIC_SSE_RCPP_RR : InstrItinClass;
def IIC_SSE_RCPP_RM : InstrItinClass;
@@ -533,8 +551,9 @@ def IIC_NOP : InstrItinClass;
// Resources beyond the decoder operate on micro-ops and are bufferred
// so adjacent micro-ops don't directly compete.
//
-// MinLatency=0 indicates that RAW dependencies can be decoded in the
-// same cycle.
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
//
// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
// indicates high latency opcodes. Alternatively, InstrItinData
@@ -542,17 +561,14 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// ILPWindow=10 is an arbitrary threshold that approximates cycles of
-// latency hidden by instruction buffers. The actual value is not very
-// important but should be zero for inorder and nonzero for OOO processors.
-//
// The GenericModel contains no instruciton itineraries.
def GenericModel : SchedMachineModel {
let IssueWidth = 4;
- let MinLatency = 0;
+ let MicroOpBufferSize = 32;
let LoadLatency = 4;
let HighLatency = 10;
- let ILPWindow = 10;
}
include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 1e5f2d6..14a1471 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -33,7 +33,6 @@ def AtomItineraries : ProcessorItineraries<
// InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
//
// Default is 1 cycle, port0 or port1
- InstrItinData<IIC_DEFAULT, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
@@ -212,10 +211,15 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
@@ -521,11 +525,9 @@ def AtomItineraries : ProcessorItineraries<
// Atom machine model.
def AtomModel : SchedMachineModel {
let IssueWidth = 2; // Allows 2 instructions per scheduling group.
- let MinLatency = 1; // InstrStage cycles overrides MinLatency.
- // OperandCycles may be used for expected latency.
+ let MicroOpBufferSize = 0; // In-order execution, always hide latency.
let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
let HighLatency = 30;// Expected, may be overriden by OperandCycles.
- let ILPWindow = 0; // Always try to hide expected latency.
let Itineraries = AtomItineraries;
}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index f934fdd..d1db79f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -27,7 +27,7 @@ X86SelectionDAGInfo::~X86SelectionDAGInfo() {
}
SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -175,7 +175,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
}
SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index d1d66fe..d728af5 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -34,7 +34,7 @@ public:
~X86SelectionDAGInfo();
virtual
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -42,7 +42,7 @@ public:
MachinePointerInfo DstPtrInfo) const;
virtual
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 0f2c008..fae90f2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -37,8 +37,7 @@ using namespace llvm;
/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
/// current subtarget according to how we should reference it in a non-pcrel
/// context.
-unsigned char X86Subtarget::
-ClassifyBlockAddressReference() const {
+unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
if (isPICStyleGOT()) // 32-bit ELF targets.
return X86II::MO_GOTOFF;
@@ -171,6 +170,26 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
+static bool OSHasAVXSupport() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+#if defined(__GNUC__)
+ // Check xgetbv; this uses a .byte sequence instead of the instruction
+ // directly because older assemblers do not include support for xgetbv and
+ // there is no easy way to conditionally compile based on the assembler used.
+ int rEAX, rEDX;
+ __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
+ unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+#else
+ int rEAX = 0; // Ensures we return false
+#endif
+ return (rEAX & 6) == 6;
+#else
+ return false;
+#endif
+}
+
void X86Subtarget::AutoDetectSubtargetFeatures() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
@@ -193,7 +212,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
- if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
+ if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) {
+ X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX);
+ }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
@@ -283,6 +304,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasLZCNT = true;
ToggleFeature(X86::FeatureLZCNT);
}
+ if (IsIntel && ((ECX >> 8) & 0x1)) {
+ HasPRFCHW = true;
+ ToggleFeature(X86::FeaturePRFCHW);
+ }
if (IsAMD) {
if ((ECX >> 6) & 0x1) {
HasSSE4A = true;
@@ -310,6 +335,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasBMI = true;
ToggleFeature(X86::FeatureBMI);
}
+ if ((EBX >> 4) & 0x1) {
+ HasHLE = true;
+ ToggleFeature(X86::FeatureHLE);
+ }
if (IsIntel && ((EBX >> 5) & 0x1)) {
X86SSELevel = AVX2;
ToggleFeature(X86::FeatureAVX2);
@@ -322,6 +351,14 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasRTM = true;
ToggleFeature(X86::FeatureRTM);
}
+ if (IsIntel && ((EBX >> 19) & 0x1)) {
+ HasADX = true;
+ ToggleFeature(X86::FeatureADX);
+ }
+ if (IsIntel && ((EBX >> 18) & 0x1)) {
+ HasRDSEED = true;
+ ToggleFeature(X86::FeatureRDSEED);
+ }
}
}
}
@@ -439,7 +476,13 @@ void X86Subtarget::initializeEnvironment() {
HasBMI = false;
HasBMI2 = false;
HasRTM = false;
+ HasHLE = false;
+ HasERI = false;
+ HasCDI = false;
+ HasPFI=false;
HasADX = false;
+ HasPRFCHW = false;
+ HasRDSEED = false;
IsBTMemSlow = false;
IsUAMemFast = false;
HasVectorUAMem = false;
@@ -448,6 +491,8 @@ void X86Subtarget::initializeEnvironment() {
HasSlowDivide = false;
PostRAScheduler = false;
PadShortFunctions = false;
+ CallRegIndirect = false;
+ LEAUsesAG = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index e97da4b..8793238 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
class X86Subtarget : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
- NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
};
enum X863DNowEnum {
@@ -121,9 +121,18 @@ protected:
/// HasRTM - Processor has RTM instructions.
bool HasRTM;
+ /// HasHLE - Processor has HLE.
+ bool HasHLE;
+
/// HasADX - Processor has ADX instructions.
bool HasADX;
+ /// HasPRFCHW - Processor has PRFCHW instructions.
+ bool HasPRFCHW;
+
+ /// HasRDSEED - Processor has RDSEED instructions.
+ bool HasRDSEED;
+
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -153,6 +162,22 @@ protected:
/// a stall when returning too early.
bool PadShortFunctions;
+ /// CallRegIndirect - True if the Calls with memory reference should be converted
+ /// to a register-based indirect call.
+ bool CallRegIndirect;
+ /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
+ /// address generation (AG) time.
+ bool LEAUsesAG;
+
+ /// Processor has AVX-512 PreFetch Instructions
+ bool HasPFI;
+
+ /// Processor has AVX-512 Exponential and Reciprocal Instructions
+ bool HasERI;
+
+ /// Processor has AVX-512 Conflict Detection Instructions
+ bool HasCDI;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -233,6 +258,7 @@ public:
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512; }
bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
@@ -253,7 +279,10 @@ public:
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
bool hasRTM() const { return HasRTM; }
+ bool hasHLE() const { return HasHLE; }
bool hasADX() const { return HasADX; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasRDSEED() const { return HasRDSEED; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
@@ -261,6 +290,11 @@ public:
bool useLeaForSP() const { return UseLeaForSP; }
bool hasSlowDivide() const { return HasSlowDivide; }
bool padShortFunctions() const { return PadShortFunctions; }
+ bool callRegIndirect() const { return CallRegIndirect; }
+ bool LEAusesAG() const { return LEAUsesAG; }
+ bool hasCDI() const { return HasCDI; }
+ bool hasPFI() const { return HasPFI; }
+ bool hasERI() const { return HasERI; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
@@ -317,7 +351,13 @@ public:
}
bool isPICStyleStubAny() const {
return PICStyle == PICStyles::StubDynamicNoPIC ||
- PICStyle == PICStyles::StubPIC; }
+ PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ return (isTargetWin64() && CC != CallingConv::X86_64_SysV) ||
+ CC == CallingConv::X86_64_Win64;
+ }
/// ClassifyGlobalReference - Classify a global variable reference for the
/// current subtarget according to how we should reference it in a non-pcrel
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 8aa58a2..49ebd1a 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -49,6 +49,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
JITInfo(*this) {
+ initAsmInfo();
}
void X86_64TargetMachine::anchor() { }
@@ -69,6 +70,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
JITInfo(*this) {
+ initAsmInfo();
}
/// X86TargetMachine ctor - Create an X86 target.
@@ -130,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
// Add first the target-independent BasicTTI pass, then our X86 pass. This
// allows the X86 pass to delegate to the target independent layer when
// appropriate.
- PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+ PM.add(createBasicTargetTransformInfoPass(this));
PM.add(createX86TargetTransformInfoPass(this));
}
@@ -215,6 +217,11 @@ bool X86PassConfig::addPreEmitPass() {
addPass(createX86PadShortFunctions());
ShouldPrint = true;
}
+ if (getOptLevel() != CodeGenOpt::None &&
+ getX86Subtarget().LEAusesAG()){
+ addPass(createX86FixupLEAs());
+ ShouldPrint = true;
+ }
return ShouldPrint;
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 871dacd..a19c5a6 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
+
+const MCExpr *
+X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
+ const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 9d26d38..79c861d 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -36,6 +36,9 @@ namespace llvm {
/// and x86-64.
class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+ /// \brief Describe a TLS variable address within debug info.
+ virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index be2a997..3bbddad 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &);
namespace {
class X86TTI : public ImmutablePass, public TargetTransformInfo {
- const X86TargetMachine *TM;
const X86Subtarget *ST;
const X86TargetLowering *TLI;
@@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
llvm_unreachable("This pass cannot be directly constructed");
}
X86TTI(const X86TargetMachine *TM)
- : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+ : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
TLI(TM->getTargetLowering()) {
initializeX86TTIPass(*PassRegistry::getPassRegistry());
}
@@ -86,7 +85,9 @@ public:
virtual unsigned getNumberOfRegisters(bool Vector) const;
virtual unsigned getRegisterBitWidth(bool Vector) const;
virtual unsigned getMaximumUnrollFactor() const;
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind,
+ OperandValueKind) const;
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
int Index, Type *SubTp) const;
virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -99,6 +100,8 @@ public:
unsigned Alignment,
unsigned AddressSpace) const;
+ virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
+
/// @}
};
@@ -162,13 +165,134 @@ unsigned X86TTI::getMaximumUnrollFactor() const {
return 2;
}
-unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
+unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Op1Info,
+ OperandValueKind Op2Info) const {
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry<MVT> AVX2CostTable[] = {
+ // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+ // customize them to detect the cases where shift amount is a scalar one.
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
+
+ { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence.
+ { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.
+
+ { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized.
+ { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized.
+
+ { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
+ { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
+ { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2CostTable[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ // Constant splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i8, 1 }, // psllw.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
+ array_lengthof(SSE2UniformConstCostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+ }
+
+
+ static const CostTblEntry<MVT> SSE2CostTable[] = {
+ // We don't correctly identify costs of casts because they are marked as
+ // custom.
+ // For some cases, where the shift amount is a scalar we would be able
+ // to generate better code. Unfortunately, when this is the case the value
+ // (the splat) will get hoisted out of the loop, thereby making it invisible
+ // to ISel. The cost model must return worst case assumptions because it is
+ // used for vectorization and we don't want to make vectorized code worse
+ // than scalar code.
+ { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence.
+ { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+
+ { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
+ { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
+
+ { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
+ { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
+ { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
+
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spilling regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyways so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ { ISD::SDIV, MVT::v16i8, 16*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v16i8, 16*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
+ };
+
+ if (ST->hasSSE2()) {
+ int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE2CostTable[Idx].Cost;
+ }
+
static const CostTblEntry<MVT> AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
@@ -213,7 +337,8 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
return 6;
// Fallback to the default implementation.
- return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty);
+ return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+ Op2Info);
}
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
@@ -235,9 +360,44 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+
+ static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+ // These are somewhat magic numbers justified by looking at the output of
+ // Intel's IACA, running some kernels and making sure when we take
+ // legalization into account the throughput will be overestimated.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ // There are faster sequences for float conversions.
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ };
+
+ if (ST->hasSSE2() && !ST->hasAVX()) {
+ int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
+ array_lengthof(SSE2ConvTbl),
+ ISD, LTDest.second, LTSrc.second);
+ if (Idx != -1)
+ return LTSrc.first * SSE2ConvTbl[Idx].Cost;
+ }
+
EVT SrcTy = TLI->getValueType(Src);
EVT DstTy = TLI->getValueType(Dst);
+ // The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
@@ -248,17 +408,40 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+
{ ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
};
@@ -310,19 +493,22 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
};
if (ST->hasAVX2()) {
- int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * AVX2CostTbl[Idx].Cost;
}
if (ST->hasAVX()) {
- int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * AVX1CostTbl[Idx].Cost;
}
if (ST->hasSSE42()) {
- int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * SSE42CostTbl[Idx].Cost;
}
@@ -354,8 +540,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}
+unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
+ bool Extract) const {
+ assert (Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const {
+ // Handle non power of two vectors such as <3 x float>
+ if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+ unsigned NumElem = VTy->getVectorNumElements();
+
+ // Handle a few common cases:
+ // <3 x float>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+ // Cost = 64 bit store + extract + 32 bit store.
+ return 3;
+
+ // <3 x double>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+ // Cost = 128 bit store + unpack + 64 bit store.
+ return 3;
+
+ // Assume that all other non power-of-two numbers are scalarized.
+ if (!isPowerOf2_32(NumElem)) {
+ unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
+ VTy->getScalarType(),
+ Alignment,
+ AddressSpace);
+ unsigned SplitCost = getScalarizationOverhead(Src,
+ Opcode == Instruction::Load,
+ Opcode==Instruction::Store);
+ return NumElem * Cost + SplitCost;
+ }
+ }
+
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -371,3 +600,16 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
return Cost;
}
+
+unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ unsigned NumVectorInstToHideOverhead = 10;
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+}
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0f77948..477f75a 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
}
static bool isYmmReg(unsigned Reg) {
- if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
- return true;
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
+}
- return false;
+static bool isZmmReg(unsigned Reg) {
+ return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
}
static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmReg(I->first) || isZmmReg(I->first))
return true;
return false;
}
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+ for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}