Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 620
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 1
-rw-r--r--  lib/Target/X86/Disassembler/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 235
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.h | 46
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.c | 57
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 15
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 19
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 69
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 7
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 29
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.h | 2
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 65
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.h | 6
-rw-r--r--  lib/Target/X86/MCTargetDesc/CMakeLists.txt | 4
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 64
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 72
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 224
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 18
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 169
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 31
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 6
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 65
-rw-r--r--  lib/Target/X86/README.txt | 34
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp | 80
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h | 24
-rw-r--r--  lib/Target/X86/X86.td | 100
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 43
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.cpp | 2
-rw-r--r--  lib/Target/X86/X86COFFMachineModuleInfo.h | 2
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 45
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 5
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.cpp | 8
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 34
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 24
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 224
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 2
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 380
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 3551
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 145
-rw-r--r--  lib/Target/X86/X86Instr3DNow.td | 2
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 233
-rw-r--r--  lib/Target/X86/X86InstrCMovSetCC.td | 28
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 92
-rw-r--r--  lib/Target/X86/X86InstrControl.td | 189
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 84
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 504
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td | 63
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 309
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 157
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 306
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 2
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 115
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 30
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 4645
-rw-r--r--  lib/Target/X86/X86InstrSVM.td | 62
-rw-r--r--  lib/Target/X86/X86InstrShiftRotate.td | 513
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 18
-rw-r--r--  lib/Target/X86/X86InstrVMX.td | 10
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 270
-rw-r--r--  lib/Target/X86/X86JITInfo.cpp | 6
-rw-r--r--  lib/Target/X86/X86JITInfo.h | 2
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 20
-rw-r--r--  lib/Target/X86/X86MCInstLower.h | 2
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.cpp | 14
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h | 8
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 192
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 5
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 2
-rw-r--r--  lib/Target/X86/X86Relocations.h | 2
-rw-r--r--  lib/Target/X86/X86Schedule.td | 262
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 294
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 3
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 71
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 73
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 52
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 28
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 2
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 2
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 2
82 files changed, 8761 insertions, 6452 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f4639a3..d91830f 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -31,10 +31,9 @@ using namespace llvm;
namespace {
struct X86Operand;
-class X86ATTAsmParser : public MCTargetAsmParser {
+class X86AsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
-
private:
MCAsmParser &getParser() const { return Parser; }
@@ -45,12 +44,24 @@ private:
return Parser.Error(L, Msg, Ranges);
}
+ X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
+ Error(Loc, Msg);
+ return 0;
+ }
+
X86Operand *ParseOperand();
+ X86Operand *ParseATTOperand();
+ X86Operand *ParseIntelOperand();
+ X86Operand *ParseIntelMemOperand();
+ X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size);
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+ bool processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+
bool MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
@@ -81,7 +92,7 @@ private:
/// }
public:
- X86ATTAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
+ X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
: MCTargetAsmParser(), STI(sti), Parser(parser) {
// Initialize the set of available features.
@@ -93,6 +104,10 @@ public:
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
virtual bool ParseDirective(AsmToken DirectiveID);
+
+ bool isParsingIntelSyntax() {
+ return getParser().getAssemblerDialect();
+ }
};
} // end anonymous namespace
@@ -103,6 +118,31 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
+static bool isImmSExti16i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti32i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmZExtu32u8Value(uint64_t Value) {
+ return (Value <= 0x00000000000000FFULL);
+}
+
+static bool isImmSExti64i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti64i32Value(uint64_t Value) {
+ return (( Value <= 0x000000007FFFFFFFULL)||
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
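
These helpers test fixed windows of the raw unsigned 64-bit value: a value passes when it is a small positive constant or the 16/32/64-bit sign-extension of a negative one. A minimal standalone sketch of the same check (values are illustrative, not part of this patch):

#include <cassert>
#include <cstdint>

// Same windows as isImmSExti32i8Value above: a small positive byte, a
// negative byte sign-extended to 32 bits, or one sign-extended to 64 bits.
static bool fitsSExti32i8(uint64_t V) {
  return V <= 0x7FULL ||
         (0xFFFFFF80ULL <= V && V <= 0xFFFFFFFFULL) ||
         0xFFFFFFFFFFFFFF80ULL <= V;
}

int main() {
  assert(fitsSExti32i8(0x7F));                  // +127 fits
  assert(fitsSExti32i8(0xFFFFFF80ULL));         // -128 as a 32-bit immediate
  assert(fitsSExti32i8(0xFFFFFFFFFFFFFF80ULL)); // -128 as a 64-bit immediate
  assert(!fitsSExti32i8(0x80));                 // +128 needs a wider form
}
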
namespace {
/// X86Operand - Instances of this class represent a parsed X86 machine
@@ -137,6 +177,7 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned BaseReg;
unsigned IndexReg;
unsigned Scale;
+ unsigned Size;
} Mem;
};
@@ -209,10 +250,7 @@ struct X86Operand : public MCParsedAsmOperand {
// Otherwise, check the value is in a range that makes sense for this
// extension.
- uint64_t Value = CE->getValue();
- return (( Value <= 0x000000000000007FULL)||
- (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isImmSExti16i8Value(CE->getValue());
}
bool isImmSExti32i8() const {
if (!isImm())
@@ -226,10 +264,7 @@ struct X86Operand : public MCParsedAsmOperand {
// Otherwise, check the value is in a range that makes sense for this
// extension.
- uint64_t Value = CE->getValue();
- return (( Value <= 0x000000000000007FULL)||
- (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isImmSExti32i8Value(CE->getValue());
}
bool isImmZExtu32u8() const {
if (!isImm())
@@ -243,8 +278,7 @@ struct X86Operand : public MCParsedAsmOperand {
// Otherwise, check the value is in a range that makes sense for this
// extension.
- uint64_t Value = CE->getValue();
- return (Value <= 0x00000000000000FFULL);
+ return isImmZExtu32u8Value(CE->getValue());
}
bool isImmSExti64i8() const {
if (!isImm())
@@ -258,9 +292,7 @@ struct X86Operand : public MCParsedAsmOperand {
// Otherwise, check the value is in a range that makes sense for this
// extension.
- uint64_t Value = CE->getValue();
- return (( Value <= 0x000000000000007FULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isImmSExti64i8Value(CE->getValue());
}
bool isImmSExti64i32() const {
if (!isImm())
@@ -274,12 +306,31 @@ struct X86Operand : public MCParsedAsmOperand {
// Otherwise, check the value is in a range that makes sense for this
// extension.
- uint64_t Value = CE->getValue();
- return (( Value <= 0x000000007FFFFFFFULL)||
- (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isImmSExti64i32Value(CE->getValue());
}
bool isMem() const { return Kind == Memory; }
+ bool isMem8() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMem16() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMem32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMem64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMem80() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+ }
+ bool isMem128() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+ }
+ bool isMem256() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+ }
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
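
The new predicates gate matching on the size the Intel parser recorded; Mem.Size == 0 (no BYTE/WORD/... annotation, which is always the case for AT&T operands) matches every width. A toy sketch of that gating rule, using a stand-in struct rather than the real X86Operand:

#include <cassert>

// Size == 0 means "unannotated", so such an operand matches any width and
// AT&T parsing is unaffected by the new Intel size checks.
struct MemOp { unsigned Size; }; // 0, 8, 16, 32, 64, 80, 128, or 256

static bool isMemN(const MemOp &Op, unsigned N) {
  return Op.Size == 0 || Op.Size == N;
}

int main() {
  MemOp ATT   = {0};  // movl (%eax), %ebx : no size keyword
  MemOp Intel = {32}; // mov ebx, DWORD PTR [eax]
  assert(isMemN(ATT, 8) && isMemN(ATT, 64)); // unannotated matches all widths
  assert(isMemN(Intel, 32) && !isMemN(Intel, 16));
}
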
@@ -306,6 +357,28 @@ struct X86Operand : public MCParsedAsmOperand {
addExpr(Inst, getImm());
}
+ void addMem8Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem16Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem80Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem128Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem256Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
@@ -317,7 +390,11 @@ struct X86Operand : public MCParsedAsmOperand {
void addAbsMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 1) && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
}
static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
@@ -342,20 +419,22 @@ struct X86Operand : public MCParsedAsmOperand {
/// Create an absolute memory operand.
static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc,
- SMLoc EndLoc) {
+ SMLoc EndLoc, unsigned Size = 0) {
X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
Res->Mem.IndexReg = 0;
Res->Mem.Scale = 1;
+ Res->Mem.Size = Size;
return Res;
}
/// Create a generalized memory operand.
static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc StartLoc, SMLoc EndLoc) {
+ unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -369,13 +448,14 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.BaseReg = BaseReg;
Res->Mem.IndexReg = IndexReg;
Res->Mem.Scale = Scale;
+ Res->Mem.Size = Size;
return Res;
}
};
} // end anonymous namespace.
-bool X86ATTAsmParser::isSrcOp(X86Operand &Op) {
+bool X86AsmParser::isSrcOp(X86Operand &Op) {
unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI;
return (Op.isMem() &&
@@ -385,7 +465,7 @@ bool X86ATTAsmParser::isSrcOp(X86Operand &Op) {
Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0);
}
-bool X86ATTAsmParser::isDstOp(X86Operand &Op) {
+bool X86AsmParser::isDstOp(X86Operand &Op) {
unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
return Op.isMem() && Op.Mem.SegReg == X86::ES &&
@@ -394,18 +474,22 @@ bool X86ATTAsmParser::isDstOp(X86Operand &Op) {
Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0;
}
-bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
- SMLoc &StartLoc, SMLoc &EndLoc) {
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc, SMLoc &EndLoc) {
RegNo = 0;
- const AsmToken &TokPercent = Parser.getTok();
- assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
- StartLoc = TokPercent.getLoc();
- Parser.Lex(); // Eat percent token.
+ if (!isParsingIntelSyntax()) {
+ const AsmToken &TokPercent = Parser.getTok();
+ assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
+ StartLoc = TokPercent.getLoc();
+ Parser.Lex(); // Eat percent token.
+ }
const AsmToken &Tok = Parser.getTok();
- if (Tok.isNot(AsmToken::Identifier))
+ if (Tok.isNot(AsmToken::Identifier)) {
+ if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
SMRange(StartLoc, Tok.getEndLoc()));
+ }
RegNo = MatchRegisterName(Tok.getString());
@@ -485,16 +569,182 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
}
}
- if (RegNo == 0)
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
SMRange(StartLoc, Tok.getEndLoc()));
+ }
EndLoc = Tok.getEndLoc();
Parser.Lex(); // Eat identifier token.
return false;
}
-X86Operand *X86ATTAsmParser::ParseOperand() {
+X86Operand *X86AsmParser::ParseOperand() {
+ if (isParsingIntelSyntax())
+ return ParseIntelOperand();
+ return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return the Intel memory operand size.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+ unsigned Size = 0;
+ if (OpStr == "BYTE") Size = 8;
+ if (OpStr == "WORD") Size = 16;
+ if (OpStr == "DWORD") Size = 32;
+ if (OpStr == "QWORD") Size = 64;
+ if (OpStr == "XWORD") Size = 80;
+ if (OpStr == "XMMWORD") Size = 128;
+ if (OpStr == "YMMWORD") Size = 256;
+ return Size;
+}
+
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
+ unsigned Size) {
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+ SMLoc Start = Parser.getTok().getLoc(), End;
+
+ const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ]
+
+ // Eat '['
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(Start, "Expected '[' token!");
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::Identifier)) {
+ // Parse BaseReg
+ if (ParseRegister(BaseReg, Start, End)) {
+ // Handle '[' 'symbol' ']'
+ if (getParser().ParseExpression(Disp, End)) return 0;
+ if (getLexer().isNot(AsmToken::RBrac))
+ return ErrorOperand(Start, "Expected ']' token!");
+ Parser.Lex();
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+ }
+ } else if (getLexer().is(AsmToken::Integer)) {
+ int64_t Val = Parser.getTok().getIntVal();
+ Parser.Lex();
+ SMLoc Loc = Parser.getTok().getLoc();
+ if (getLexer().is(AsmToken::RBrac)) {
+ // Handle '[' number ']'
+ Parser.Lex();
+ const MCExpr *Disp = MCConstantExpr::Create(Val, getContext());
+ if (SegReg)
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale,
+ Start, End, Size);
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+ } else if (getLexer().is(AsmToken::Star)) {
+ // Handle '[' Scale*IndexReg ']'
+ Parser.Lex();
+ SMLoc IdxRegLoc = Parser.getTok().getLoc();
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
+ Scale = Val;
+ } else
+ return ErrorOperand(Loc, "Unexpected token");
+ }
+
+ if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
+ bool isPlus = getLexer().is(AsmToken::Plus);
+ Parser.Lex();
+ SMLoc PlusLoc = Parser.getTok().getLoc();
+ if (getLexer().is(AsmToken::Integer)) {
+ int64_t Val = Parser.getTok().getIntVal();
+ Parser.Lex();
+ if (getLexer().is(AsmToken::Star)) {
+ Parser.Lex();
+ SMLoc IdxRegLoc = Parser.getTok().getLoc();
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
+ Scale = Val;
+ } else if (getLexer().is(AsmToken::RBrac)) {
+ const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext());
+ Disp = isPlus ? ValExpr : MCConstantExpr::Create(0-Val, getContext());
+ } else
+ return ErrorOperand(PlusLoc, "unexpected token after +");
+ } else if (getLexer().is(AsmToken::Identifier)) {
+ // This could be an index register or a displacement expression.
+ End = Parser.getTok().getLoc();
+ if (!IndexReg)
+ ParseRegister(IndexReg, Start, End);
+ else if (getParser().ParseExpression(Disp, End)) return 0;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ if (getParser().ParseExpression(Disp, End)) return 0;
+
+ End = Parser.getTok().getLoc();
+ if (getLexer().isNot(AsmToken::RBrac))
+ return ErrorOperand(End, "expected ']' token!");
+ Parser.Lex();
+ End = Parser.getTok().getLoc();
+
+ // handle [-42]
+ if (!BaseReg && !IndexReg)
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+ Start, End, Size);
+}
+
+/// ParseIntelMemOperand - Parse an Intel-style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand() {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ unsigned SegReg = 0;
+
+ unsigned Size = getIntelMemOperandSize(Tok.getString());
+ if (Size) {
+ Parser.Lex();
+ assert (Tok.getString() == "PTR" && "Unexpected token!");
+ Parser.Lex();
+ }
+
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(SegReg, Size);
+
+ if (!ParseRegister(SegReg, Start, End)) {
+ // Handle SegReg : [ ... ]
+ if (getLexer().isNot(AsmToken::Colon))
+ return ErrorOperand(Start, "Expected ':' token!");
+ Parser.Lex(); // Eat :
+ if (getLexer().isNot(AsmToken::LBrac))
+ return ErrorOperand(Start, "Expected '[' token!");
+ return ParseIntelBracExpression(SegReg, Size);
+ }
+
+ const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ if (getParser().ParseExpression(Disp, End)) return 0;
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+}
+
+X86Operand *X86AsmParser::ParseIntelOperand() {
+ SMLoc Start = Parser.getTok().getLoc(), End;
+
+ // immediate.
+ if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
+ getLexer().is(AsmToken::Minus)) {
+ const MCExpr *Val;
+ if (!getParser().ParseExpression(Val, End)) {
+ End = Parser.getTok().getLoc();
+ return X86Operand::CreateImm(Val, Start, End);
+ }
+ }
+
+ // register
+ unsigned RegNo = 0;
+ if (!ParseRegister(RegNo, Start, End)) {
+ End = Parser.getTok().getLoc();
+ return X86Operand::CreateReg(RegNo, Start, End);
+ }
+
+ // mem operand
+ return ParseIntelMemOperand();
+}
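
For orientation, this is roughly the shape ParseIntelBracExpression recovers from the token stream. A self-contained toy parser for [ BaseReg + Scale*IndexReg + Disp ], using a plain string splitter instead of the MC lexer and omitting the symbol, segment, and error paths:

#include <cassert>
#include <cctype>
#include <sstream>
#include <string>

// Toy model of the bracket grammar: split on '+', classify each term as
// Scale*IndexReg, a displacement, or a base register.
struct EA { std::string Base, Index; int Scale = 1; long Disp = 0; };

static EA parseBrac(const std::string &Expr) {
  EA R;
  std::stringstream SS(Expr.substr(1, Expr.size() - 2)); // strip [ and ]
  std::string Term;
  while (std::getline(SS, Term, '+')) {
    size_t Star = Term.find('*');
    if (Star != std::string::npos) {                   // Scale*IndexReg
      R.Scale = std::stoi(Term.substr(0, Star));
      R.Index = Term.substr(Star + 1);
    } else if (std::isdigit((unsigned char)Term[0])) { // displacement
      R.Disp = std::stol(Term);
    } else {                                           // base register
      R.Base = Term;
    }
  }
  return R;
}

int main() {
  EA R = parseBrac("[eax+4*ecx+16]");
  assert(R.Base == "eax" && R.Index == "ecx" && R.Scale == 4 && R.Disp == 16);
}
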
+
+X86Operand *X86AsmParser::ParseATTOperand() {
switch (getLexer().getKind()) {
default:
// Parse a memory operand with no segment register.
@@ -533,7 +783,7 @@ X86Operand *X86ATTAsmParser::ParseOperand() {
/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
/// has already been parsed if present.
-X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// We have to disambiguate a parenthesized expression "(4+5)" from the start
// of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
@@ -664,7 +914,7 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
MemStart, MemEnd);
}
-bool X86ATTAsmParser::
+bool X86AsmParser::
ParseInstruction(StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
StringRef PatchedName = Name;
@@ -734,10 +984,9 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
- if (ExtraImmOp)
+ if (ExtraImmOp && !isParsingIntelSyntax())
Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
-
// Determine whether this is an instruction prefix.
bool isPrefix =
Name == "lock" || Name == "rep" ||
@@ -791,6 +1040,9 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
else if (isPrefix && getLexer().is(AsmToken::Slash))
Parser.Lex(); // Consume the prefix separator Slash
+ if (ExtraImmOp && isParsingIntelSyntax())
+ Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
// This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
// "outb %al, %dx". Out doesn't take a memory form, but this is a widely
// documented form in various unofficial manuals, so a lot of code uses it.
@@ -926,11 +1178,21 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
Name.startswith("rcl") || Name.startswith("rcr") ||
Name.startswith("rol") || Name.startswith("ror")) &&
Operands.size() == 3) {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[1];
- Operands.erase(Operands.begin() + 1);
+ if (isParsingIntelSyntax()) {
+ // Intel syntax
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
+ if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[2];
+ Operands.pop_back();
+ }
+ } else {
+ X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+ if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
+ }
}
}
@@ -949,7 +1211,246 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
return false;
}
-bool X86ATTAsmParser::
+bool X86AsmParser::
+processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+ switch (Inst.getOpcode()) {
+ default: return false;
+ case X86::AND16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::AND16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::AND32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::AND32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::AND64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::AND64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::XOR16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::XOR16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::XOR32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::XOR32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::XOR64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::XOR64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::OR16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::OR16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::OR32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::OR32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::OR64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::OR64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::CMP16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::CMP16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::CMP32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::CMP32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::CMP64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::CMP64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::ADD16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::ADD16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::ADD32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::ADD32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::ADD64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::ADD64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::SUB16i16: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::SUB16ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::AX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::SUB32i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::SUB32ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::EAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::SUB64i32: {
+ if (!Inst.getOperand(0).isImm() ||
+ !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::SUB64ri8);
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(MCOperand::CreateReg(X86::RAX));
+ TmpInst.addOperand(Inst.getOperand(0));
+ Inst = TmpInst;
+ return true;
+ }
+ }
+}
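
All sixteen cases above share one shape: an "<op> {ax,eax,rax}, imm" form is re-encoded as the corresponding ri8 form when the immediate fits in a sign-extended 8-bit field. A table-driven sketch of that selection, where the opcode values are placeholders and the committed code deliberately spells each case out instead:

#include <cassert>
#include <cstdint>

// Shrink an accumulator-immediate opcode to its sign-extended-imm8 form
// when the immediate fits; otherwise keep the wide encoding.
static bool fitsSExti8(int64_t V) { return -128 <= V && V <= 127; }

struct Shrink { unsigned From, To; };

static unsigned shrinkOpcode(unsigned Opc, int64_t Imm,
                             const Shrink *Tbl, unsigned N) {
  if (!fitsSExti8(Imm))
    return Opc;
  for (unsigned i = 0; i != N; ++i)
    if (Tbl[i].From == Opc)
      return Tbl[i].To;
  return Opc;
}

int main() {
  enum { AND32i32 = 1, AND32ri8 = 2 }; // stand-ins for X86:: opcode values
  const Shrink Tbl[] = {{AND32i32, AND32ri8}};
  assert(shrinkOpcode(AND32i32, -1, Tbl, 1) == AND32ri8); // fits imm8
  assert(shrinkOpcode(AND32i32, 300, Tbl, 1) == AND32i32); // too wide
}
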
+
+bool X86AsmParser::
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
@@ -967,6 +1468,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
MCInst Inst;
Inst.setOpcode(X86::WAIT);
+ Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst);
const char *Repl =
@@ -990,9 +1492,17 @@ MatchAndEmitInstruction(SMLoc IDLoc,
MCInst Inst;
// First, try a direct match.
- switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo)) {
+ switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo,
+ isParsingIntelSyntax())) {
default: break;
case Match_Success:
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ while (processInstruction(Inst, Operands))
+ ;
+
+ Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst);
return false;
case Match_MissingFeature:
@@ -1050,6 +1560,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
(Match1 == Match_Success) + (Match2 == Match_Success) +
(Match3 == Match_Success) + (Match4 == Match_Success);
if (NumSuccessfulMatches == 1) {
+ Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst);
return false;
}
@@ -1130,18 +1641,29 @@ MatchAndEmitInstruction(SMLoc IDLoc,
}
-bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
if (IDVal == ".word")
return ParseDirectiveWord(2, DirectiveID.getLoc());
else if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+ else if (IDVal.startswith(".intel_syntax")) {
+ getParser().setAssemblerDialect(1);
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if(Parser.getTok().getString() == "noprefix") {
+ // FIXME : Handle noprefix
+ Parser.Lex();
+ } else
+ return true;
+ }
+ return false;
+ }
return true;
}
/// ParseDirectiveWord
/// ::= .word [ expression (, expression)* ]
-bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
@@ -1166,7 +1688,7 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
/// ParseDirectiveCode
/// ::= .code32 | .code64
-bool X86ATTAsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
if (IDVal == ".code32") {
Parser.Lex();
if (is64BitMode()) {
@@ -1191,8 +1713,8 @@ extern "C" void LLVMInitializeX86AsmLexer();
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
- RegisterMCAsmParser<X86ATTAsmParser> X(TheX86_32Target);
- RegisterMCAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
+ RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
+ RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
LLVMInitializeX86AsmLexer();
}
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index be15899..f612e23 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -26,6 +26,7 @@ set(sources
X86InstrInfo.cpp
X86JITInfo.cpp
X86MCInstLower.cpp
+ X86MachineFunctionInfo.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
X86Subtarget.cpp
diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt
index cac7adf..0609f3c 100644
--- a/lib/Target/X86/Disassembler/LLVMBuild.txt
+++ b/lib/Target/X86/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86Disassembler
parent = X86
-required_libraries = MC Support X86Info
+required_libraries = MC Support X86Desc X86Info
add_to_library_groups = X86
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 3aacb20e..8278bde 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1,4 +1,4 @@
-//===- X86Disassembler.cpp - Disassembler for x86 and x86_64 ----*- C++ -*-===//
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,9 +18,11 @@
#include "X86DisassemblerDecoder.h"
#include "llvm/MC/EDInstInfo.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryObject.h"
@@ -42,6 +44,11 @@ void x86DisassemblerDebug(const char *file,
dbgs() << file << ":" << line << ": " << s;
}
+const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) {
+ const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+ return MII->getName(Opcode);
+}
+
#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
namespace llvm {
@@ -65,17 +72,19 @@ extern Target TheX86_32Target, TheX86_64Target;
}
static bool translateInstruction(MCInst &target,
- InternalInstruction &source);
+ InternalInstruction &source,
+ const MCDisassembler *Dis);
-X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode) :
- MCDisassembler(STI),
- fMode(mode) {
-}
+X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
+ DisassemblerMode mode,
+ const MCInstrInfo *MII)
+ : MCDisassembler(STI), MII(MII), fMode(mode) {}
X86GenericDisassembler::~X86GenericDisassembler() {
+ delete MII;
}
-EDInstInfo *X86GenericDisassembler::getEDInfo() const {
+const EDInstInfo *X86GenericDisassembler::getEDInfo() const {
return instInfoX86;
}
@@ -116,6 +125,8 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
uint64_t address,
raw_ostream &vStream,
raw_ostream &cStream) const {
+ CommentStream = &cStream;
+
InternalInstruction internalInstr;
dlog_t loggerFn = logger;
@@ -127,6 +138,7 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
(void*)&region,
loggerFn,
(void*)&vStream,
+ (void*)MII,
address,
fMode);
@@ -136,7 +148,8 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
}
else {
size = internalInstr.length;
- return (!translateInstruction(instr, internalInstr)) ? Success : Fail;
+ return (!translateInstruction(instr, internalInstr, this)) ?
+ Success : Fail;
}
}
@@ -161,6 +174,140 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
}
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value - The immediate Value, has had any PC adjustment made by
+/// the caller.
+/// @param isBranch - If the instruction is a branch instruction
+/// @param Address - The starting address of the instruction
+/// @param Offset - The byte offset to this immediate in the instruction
+/// @param Width - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width. If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created. This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const MCDisassembler *Dis) {
+ LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+ struct LLVMOpInfo1 SymbolicOp;
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ SymbolicOp.Value = Value;
+ void *DisInfo = Dis->getDisInfoBlock();
+
+ if (!getOpInfo ||
+ !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
+ // Clear SymbolicOp.Value from above and also all other fields.
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+ if (!SymbolLookUp)
+ return false;
+ uint64_t ReferenceType;
+ if (isBranch)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+ else
+ ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+ const char *ReferenceName;
+ const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+ &ReferenceName);
+ if (Name) {
+ SymbolicOp.AddSymbol.Name = Name;
+ SymbolicOp.AddSymbol.Present = true;
+ }
+ // For branches always create an MCExpr so it gets printed as hex address.
+ else if (isBranch) {
+ SymbolicOp.Value = Value;
+ }
+ if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+ (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+ if (!Name && !isBranch)
+ return false;
+ }
+
+ MCContext *Ctx = Dis->getMCContext();
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+
+ return true;
+}
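
The tail of this function folds the pieces as Add - Sub + Off, with each piece optional and a plain constant as the fallback. A string-level sketch of that fold order, standing in for the MCExpr tree rather than using the MC API:

#include <cassert>
#include <string>

// Fold order from above: subtract Sub (negating if there is no Add), then
// add the non-zero offset, defaulting to "0" when nothing is present.
static std::string compose(const std::string &Add, const std::string &Sub,
                           long Off) {
  std::string E;
  if (!Sub.empty())
    E = Add.empty() ? "-" + Sub : Add + "-" + Sub;
  else if (!Add.empty())
    E = Add;
  if (Off != 0)
    E = E.empty() ? std::to_string(Off) : E + "+" + std::to_string(Off);
  return E.empty() ? "0" : E;
}

int main() {
  assert(compose("_foo", "", 8) == "_foo+8");
  assert(compose("_a", "_b", 0) == "_a-_b");
  assert(compose("", "", 0)     == "0");
}
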
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool. The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry. The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+ if (SymbolLookUp) {
+ void *DisInfo = Dis->getDisInfoBlock();
+ uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+ const char *ReferenceName;
+ (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+ if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+ }
+}
+
/// translateImmediate - Appends an immediate operand to an MCInst.
///
/// @param mcInst - The MCInst to append to.
@@ -169,10 +316,11 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
/// @param insn - The internal instruction.
static void translateImmediate(MCInst &mcInst, uint64_t immediate,
const OperandSpecifier &operand,
- InternalInstruction &insn) {
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
// Sign-extend the immediate if necessary.
- OperandType type = operand.type;
+ OperandType type = (OperandType)operand.type;
if (type == TYPE_RELv) {
switch (insn.displacementSize) {
@@ -225,6 +373,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
}
+ bool isBranch = false;
+ uint64_t pcrel = 0;
switch (type) {
case TYPE_XMM128:
mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
@@ -232,8 +382,11 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_XMM256:
mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
return;
- case TYPE_MOFFS8:
case TYPE_REL8:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ // fall through to sign extend the immediate if needed.
+ case TYPE_MOFFS8:
if(immediate & 0x80)
immediate |= ~(0xffull);
break;
@@ -241,9 +394,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
if(immediate & 0x8000)
immediate |= ~(0xffffull);
break;
- case TYPE_MOFFS32:
case TYPE_REL32:
case TYPE_REL64:
+ isBranch = true;
+ pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+ // fall through to sign extend the immediate if needed.
+ case TYPE_MOFFS32:
if(immediate & 0x80000000)
immediate |= ~(0xffffffffull);
break;
@@ -253,7 +409,10 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
break;
}
- mcInst.addOperand(MCOperand::CreateImm(immediate));
+ if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+ insn.immediateOffset, insn.immediateSize,
+ mcInst, Dis))
+ mcInst.addOperand(MCOperand::CreateImm(immediate));
}
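
The new pcrel bookkeeping makes branch targets computable from the recorded immediate position: target = startLocation + immediateOffset + immediateSize + sign-extended displacement. A small sketch for TYPE_REL8, with made-up addresses:

#include <cassert>
#include <cstdint>

// TYPE_REL8 arithmetic from above: pcrel is the address of the byte after
// the immediate, and the sign-extended immediate is added to it.
static uint64_t rel8Target(uint64_t Start, uint64_t ImmOffset, uint8_t Imm) {
  uint64_t PCRel = Start + ImmOffset + 1; // immediateSize == 1 for REL8
  int64_t Disp = (int8_t)Imm;             // sign-extend the displacement
  return PCRel + Disp;
}

int main() {
  // "jmp ." encoded as EB FE at 0x1000 branches to itself.
  assert(rel8Target(0x1000, 1, 0xFE) == 0x1000);
  // "jmp .+7" encoded as EB 05 at 0x1000 lands at 0x1007.
  assert(rel8Target(0x1000, 1, 0x05) == 0x1007);
}
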
/// translateRMRegister - Translates a register stored in the R/M field of the
@@ -300,7 +459,8 @@ static bool translateRMRegister(MCInst &mcInst,
/// @param insn - The instruction to extract Mod, R/M, and SIB fields
/// from.
/// @return - 0 on success; nonzero otherwise
-static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) {
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+ const MCDisassembler *Dis) {
// Addresses in an MCInst are represented as five operands:
// 1. basereg (register) The R/M base, or (if there is a SIB) the
// SIB base
@@ -318,6 +478,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) {
MCOperand indexReg;
MCOperand displacement;
MCOperand segmentReg;
+ uint64_t pcrel = 0;
if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
if (insn.sibBase != SIB_BASE_NONE) {
@@ -359,8 +520,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) {
debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
return true;
}
- if (insn.mode == MODE_64BIT)
+ if (insn.mode == MODE_64BIT){
+ pcrel = insn.startLocation +
+ insn.displacementOffset + insn.displacementSize;
+ tryAddingPcLoadReferenceComment(insn.startLocation +
+ insn.displacementOffset,
+ insn.displacement + pcrel, Dis);
baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+ }
else
baseReg = MCOperand::CreateReg(0);
@@ -426,7 +593,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) {
mcInst.addOperand(baseReg);
mcInst.addOperand(scaleAmount);
mcInst.addOperand(indexReg);
- mcInst.addOperand(displacement);
+ if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+ insn.startLocation, insn.displacementOffset,
+ insn.displacementSize, mcInst, Dis))
+ mcInst.addOperand(displacement);
mcInst.addOperand(segmentReg);
return false;
}
@@ -440,7 +610,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) {
/// from.
/// @return - 0 on success; nonzero otherwise
static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
- InternalInstruction &insn) {
+ InternalInstruction &insn, const MCDisassembler *Dis) {
switch (operand.type) {
default:
debug("Unexpected type for a R/M operand");
@@ -480,7 +650,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_M1632:
case TYPE_M1664:
case TYPE_LEA:
- return translateRMMemory(mcInst, insn);
+ return translateRMMemory(mcInst, insn, Dis);
}
}
@@ -510,7 +680,8 @@ static bool translateFPRegister(MCInst &mcInst,
/// @param insn - The internal instruction.
/// @return - false on success; true otherwise.
static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
- InternalInstruction &insn) {
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
switch (operand.encoding) {
default:
debug("Unhandled operand encoding during translation");
@@ -519,7 +690,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
translateRegister(mcInst, insn.reg);
return false;
case ENCODING_RM:
- return translateRM(mcInst, operand, insn);
+ return translateRM(mcInst, operand, insn, Dis);
case ENCODING_CB:
case ENCODING_CW:
case ENCODING_CD:
@@ -537,7 +708,8 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
translateImmediate(mcInst,
insn.immediates[insn.numImmediatesTranslated++],
operand,
- insn);
+ insn,
+ Dis);
return false;
case ENCODING_RB:
case ENCODING_RW:
@@ -556,7 +728,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_DUP:
return translateOperand(mcInst,
insn.spec->operands[operand.type - TYPE_DUP0],
- insn);
+ insn, Dis);
}
}
@@ -567,7 +739,8 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
/// @param insn - The internal instruction.
/// @return - false on success; true otherwise.
static bool translateInstruction(MCInst &mcInst,
- InternalInstruction &insn) {
+ InternalInstruction &insn,
+ const MCDisassembler *Dis) {
if (!insn.spec) {
debug("Instruction has no specification");
return true;
@@ -581,7 +754,7 @@ static bool translateInstruction(MCInst &mcInst,
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
if (insn.spec->operands[index].encoding != ENCODING_NONE) {
- if (translateOperand(mcInst, insn.spec->operands[index], insn)) {
+ if (translateOperand(mcInst, insn.spec->operands[index], insn, Dis)) {
return true;
}
}
@@ -590,12 +763,16 @@ static bool translateInstruction(MCInst &mcInst,
return false;
}
-static MCDisassembler *createX86_32Disassembler(const Target &T, const MCSubtargetInfo &STI) {
- return new X86Disassembler::X86_32Disassembler(STI);
+static MCDisassembler *createX86_32Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT,
+ T.createMCInstrInfo());
}
-static MCDisassembler *createX86_64Disassembler(const Target &T, const MCSubtargetInfo &STI) {
- return new X86Disassembler::X86_64Disassembler(STI);
+static MCDisassembler *createX86_64Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT,
+ T.createMCInstrInfo());
}
extern "C" void LLVMInitializeX86Disassembler() {
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 6ac9a0f..c11f51c 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -1,4 +1,4 @@
-//===- X86Disassembler.h - Disassembler for x86 and x86_64 ------*- C++ -*-===//
+//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -78,7 +78,7 @@
const char* name;
#define INSTRUCTION_IDS \
- const InstrUID *instructionIDs;
+ unsigned instructionIDs;
#include "X86DisassemblerDecoderCommon.h"
@@ -87,11 +87,10 @@
#include "llvm/MC/MCDisassembler.h"
-struct InternalInstruction;
-
namespace llvm {
class MCInst;
+class MCInstrInfo;
class MCSubtargetInfo;
class MemoryObject;
class raw_ostream;
@@ -104,13 +103,16 @@ namespace X86Disassembler {
/// All that each platform class should have to do is subclass the constructor, and
/// provide a different disassemblerMode value.
class X86GenericDisassembler : public MCDisassembler {
-protected:
+ const MCInstrInfo *MII;
+public:
/// Constructor - Initializes the disassembler.
///
/// @param mode - The X86 architecture mode to decode for.
- X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode);
-public:
+ X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode,
+ const MCInstrInfo *MII);
+private:
~X86GenericDisassembler();
+public:
/// getInstruction - See MCDisassembler.
DecodeStatus getInstruction(MCInst &instr,
@@ -121,37 +123,13 @@ public:
raw_ostream &cStream) const;
/// getEDInfo - See MCDisassembler.
- EDInstInfo *getEDInfo() const;
+ const EDInstInfo *getEDInfo() const;
private:
DisassemblerMode fMode;
};
-/// X86_16Disassembler - 16-bit X86 disassembler.
-class X86_16Disassembler : public X86GenericDisassembler {
-public:
- X86_16Disassembler(const MCSubtargetInfo &STI) :
- X86GenericDisassembler(STI, MODE_16BIT) {
- }
-};
-
-/// X86_16Disassembler - 32-bit X86 disassembler.
-class X86_32Disassembler : public X86GenericDisassembler {
-public:
- X86_32Disassembler(const MCSubtargetInfo &STI) :
- X86GenericDisassembler(STI, MODE_32BIT) {
- }
-};
-
-/// X86_16Disassembler - 64-bit X86 disassembler.
-class X86_64Disassembler : public X86GenericDisassembler {
-public:
- X86_64Disassembler(const MCSubtargetInfo &STI) :
- X86GenericDisassembler(STI, MODE_64BIT) {
- }
-};
-
} // namespace X86Disassembler
-
+
} // namespace llvm
-
+
#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 1a24807..b0e66f0 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -1,4 +1,4 @@
-/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==*
+/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
*
* The LLVM Compiler Infrastructure
*
@@ -82,11 +82,9 @@ static int modRMRequired(OpcodeType type,
decision = &THREEBYTEA7_SYM;
break;
}
-
+
return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
modrm_type != MODRM_ONEENTRY;
-
- return 0;
}
/*
@@ -103,12 +101,9 @@ static InstrUID decode(OpcodeType type,
InstructionContext insnContext,
uint8_t opcode,
uint8_t modRM) {
- const struct ModRMDecision* dec;
+ const struct ModRMDecision* dec = 0;
switch (type) {
- default:
- debug("Unknown opcode type");
- return 0;
case ONEBYTE:
dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
@@ -134,14 +129,17 @@ static InstrUID decode(OpcodeType type,
debug("Corrupt table! Unknown modrm_type");
return 0;
case MODRM_ONEENTRY:
- return dec->instructionIDs[0];
+ return modRMTable[dec->instructionIDs];
case MODRM_SPLITRM:
if (modFromModRM(modRM) == 0x3)
- return dec->instructionIDs[1];
- else
- return dec->instructionIDs[0];
+ return modRMTable[dec->instructionIDs+1];
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITREG:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
case MODRM_FULL:
- return dec->instructionIDs[modRM];
+ return modRMTable[dec->instructionIDs+modRM];
}
}
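
The lookup now indexes one shared modRMTable by a start index instead of chasing per-decision pointer arrays. A sketch of the four addressing modes, including the new MODRM_SPLITREG; the table contents here are illustrative UIDs only:

#include <cassert>
#include <cstdint>

// Flattened lookup from above: each ModRMDecision stores a start index
// into one shared table instead of its own pointer array.
enum ModRMType { ONEENTRY, SPLITRM, SPLITREG, FULL };

static const uint16_t modRMTable[] = {
  7, 8,                           // a SPLITRM pair (mod!=3, mod==3)
  10, 11, 12, 13, 14, 15, 16, 17, // SPLITREG, mod != 3: indexed by reg
  20, 21, 22, 23, 24, 25, 26, 27  // SPLITREG, mod == 3: reg + 8
};

static uint16_t lookup(ModRMType T, unsigned Start, uint8_t ModRM) {
  unsigned Mod = ModRM >> 6, Reg = (ModRM >> 3) & 0x7;
  switch (T) {
  case ONEENTRY: return modRMTable[Start];
  case SPLITRM:  return modRMTable[Start + (Mod == 0x3 ? 1 : 0)];
  case SPLITREG: return modRMTable[Start + Reg + (Mod == 0x3 ? 8 : 0)];
  case FULL:     return modRMTable[Start + ModRM];
  }
  return 0;
}

int main() {
  assert(lookup(SPLITRM, 0, 0xC0) == 8);   // mod==3 picks the second entry
  assert(lookup(SPLITREG, 2, 0xD1) == 22); // mod==3, reg==2 -> Start+2+8
}
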
@@ -712,7 +710,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
* @return - 0 if the ModR/M could be read when needed or was not needed;
* nonzero otherwise.
*/
-static int getID(struct InternalInstruction* insn) {
+static int getID(struct InternalInstruction* insn, void *miiArg) {
uint8_t attrMask;
uint16_t instructionID;
@@ -765,6 +763,8 @@ static int getID(struct InternalInstruction* insn) {
else {
if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
attrMask |= ATTR_OPSIZE;
+ else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_ADSIZE;
else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
attrMask |= ATTR_XS;
else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
@@ -826,7 +826,7 @@ static int getID(struct InternalInstruction* insn) {
const struct InstructionSpecifier *spec;
uint16_t instructionIDWithOpsize;
- const struct InstructionSpecifier *specWithOpsize;
+ const char *specName, *specWithOpSizeName;
spec = specifierForUID(instructionID);
@@ -843,11 +843,13 @@ static int getID(struct InternalInstruction* insn) {
return 0;
}
- specWithOpsize = specifierForUID(instructionIDWithOpsize);
-
- if (is16BitEquvalent(spec->name, specWithOpsize->name)) {
+ specName = x86DisassemblerGetInstrName(instructionID, miiArg);
+ specWithOpSizeName =
+ x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+
+ if (is16BitEquvalent(specName, specWithOpSizeName)) {
insn->instructionID = instructionIDWithOpsize;
- insn->spec = specWithOpsize;
+ insn->spec = specifierForUID(instructionIDWithOpsize);
} else {
insn->instructionID = instructionID;
insn->spec = spec;
@@ -1014,6 +1016,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
return 0;
insn->consumedDisplacement = TRUE;
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
switch (insn->eaDisplacement) {
case EA_DISP_NONE:
@@ -1410,6 +1413,7 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
size = insn->immediateSize;
else
insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
switch (size) {
case 1:
@@ -1472,6 +1476,7 @@ static int readVVVV(struct InternalInstruction* insn) {
static int readOperands(struct InternalInstruction* insn) {
int index;
int hasVVVV, needVVVV;
+ int sawRegImm = 0;
dbgprintf(insn, "readOperands()");
@@ -1500,11 +1505,22 @@ static int readOperands(struct InternalInstruction* insn) {
dbgprintf(insn, "We currently don't handle code-offset encodings");
return -1;
case ENCODING_IB:
+ if (sawRegImm) {
+ /* Saw a register immediate so don't read again and instead split the
+ previous immediate. FIXME: This is a hack. */
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
if (readImmediate(insn, 1))
return -1;
if (insn->spec->operands[index].type == TYPE_IMM3 &&
insn->immediates[insn->numImmediatesConsumed - 1] > 7)
return -1;
+ if (insn->spec->operands[index].type == TYPE_XMM128 ||
+ insn->spec->operands[index].type == TYPE_XMM256)
+ sawRegImm = 1;
break;
case ENCODING_IW:
if (readImmediate(insn, 2))
@@ -1596,6 +1612,7 @@ int decodeInstruction(struct InternalInstruction* insn,
void* readerArg,
dlog_t logger,
void* loggerArg,
+ void* miiArg,
uint64_t startLoc,
DisassemblerMode mode) {
memset(insn, 0, sizeof(struct InternalInstruction));
@@ -1611,7 +1628,7 @@ int decodeInstruction(struct InternalInstruction* insn,
if (readPrefixes(insn) ||
readOpcode(insn) ||
- getID(insn) ||
+ getID(insn, miiArg) ||
insn->instructionID == 0 ||
readOperands(insn))
return -1;
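
The sawRegImm path in readOperands() above handles VEX "is4" encodings, where a single immediate byte carries a register number in its high nibble and a 4-bit immediate in its low nibble; translateImmediate's TYPE_XMM128/TYPE_XMM256 cases consume the high nibble. A sketch of the split, with a hypothetical byte value:

#include <cassert>
#include <cstdint>

// "is4" immediate split: imm[7:4] is an XMM/YMM register index and
// imm[3:0] is the reused 4-bit immediate.
static unsigned is4Register(uint8_t Imm)  { return Imm >> 4; }
static unsigned is4Immediate(uint8_t Imm) { return Imm & 0xf; }

int main() {
  uint8_t Imm = 0x53;            // hypothetical immediate byte
  assert(is4Register(Imm) == 5); // e.g. selects xmm5/ymm5
  assert(is4Immediate(Imm) == 3);
}
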
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index a9c90f8..fae309b 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,4 +1,4 @@
-/*===- X86DisassemblerDecoderInternal.h - Disassembler decoder -----*- C -*-==*
+/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
*
* The LLVM Compiler Infrastructure
*
@@ -20,11 +20,10 @@
extern "C" {
#endif
-#define INSTRUCTION_SPECIFIER_FIELDS \
- const char* name;
+#define INSTRUCTION_SPECIFIER_FIELDS
#define INSTRUCTION_IDS \
- const InstrUID *instructionIDs;
+ unsigned instructionIDs;
#include "X86DisassemblerDecoderCommon.h"
@@ -460,6 +459,11 @@ struct InternalInstruction {
uint8_t addressSize;
uint8_t displacementSize;
uint8_t immediateSize;
+
+  /* Offsets from the start of the instruction to the pieces of data, which
+     are needed to find relocation entries for adding symbolic operands. */
+ uint8_t displacementOffset;
+ uint8_t immediateOffset;
/* opcode state */
@@ -554,6 +558,7 @@ int decodeInstruction(struct InternalInstruction* insn,
void* readerArg,
dlog_t logger,
void* loggerArg,
+ void* miiArg,
uint64_t startLoc,
DisassemblerMode mode);
@@ -568,6 +573,8 @@ void x86DisassemblerDebug(const char *file,
unsigned line,
const char *s);
+const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
+
#ifdef __cplusplus
}
#endif
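
The new displacementOffset/immediateOffset fields exist so a client can map
operands back to byte positions inside the instruction and look up matching
relocation entries. A hypothetical consumer, with the struct and names
invented for illustration:

#include <cstdint>

struct DecodedInsn {            // stand-in for InternalInstruction
  uint64_t startLocation;       // address where decoding began
  uint8_t  immediateOffset;     // byte offset of the immediate in the insn
  uint8_t  displacementOffset;  // byte offset of the displacement
};

// A relocation for the immediate operand is keyed by this address.
uint64_t immFixupAddress(const DecodedInsn &I) {
  return I.startLocation + I.immediateOffset;
}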
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index a7ef0cc..d2e30f1 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -1,4 +1,4 @@
-/*===- X86DisassemblerDecoderCommon.h - Disassembler decoder -------*- C -*-==*
+/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===*
*
* The LLVM Compiler Infrastructure
*
@@ -54,8 +54,9 @@
ENUM_ENTRY(ATTR_XD, 0x04) \
ENUM_ENTRY(ATTR_REXW, 0x08) \
ENUM_ENTRY(ATTR_OPSIZE, 0x10) \
- ENUM_ENTRY(ATTR_VEX, 0x20) \
- ENUM_ENTRY(ATTR_VEXL, 0x40)
+ ENUM_ENTRY(ATTR_ADSIZE, 0x20) \
+ ENUM_ENTRY(ATTR_VEX, 0x40) \
+ ENUM_ENTRY(ATTR_VEXL, 0x80)
#define ENUM_ENTRY(n, v) n = v,
enum attributeBits {
@@ -77,6 +78,8 @@ enum attributeBits {
"64-bit mode but no more") \
ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
"operands change width") \
+ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \
+ "operands change width") \
ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
"but not the operands") \
ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
@@ -88,6 +91,7 @@ enum attributeBits {
ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\
"change width; overrides IC_OPSIZE") \
ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \
ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \
"secondary") \
ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \
@@ -156,6 +160,8 @@ typedef uint16_t InstrUID;
* MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
* corresponds to one instruction; otherwise, it corresponds to
* a different instruction.
+ * MODRM_SPLITREG - The ModR/M byte's reg field (i.e., the byte divided by
+ *                  8) selects the instruction; used for opcodes that treat
+ *                  the reg field as an opcode extension.
* MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
* to a different instruction.
*/
@@ -163,6 +169,7 @@ typedef uint16_t InstrUID;
#define MODRMTYPES \
ENUM_ENTRY(MODRM_ONEENTRY) \
ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITREG) \
ENUM_ENTRY(MODRM_FULL)
#define ENUM_ENTRY(n) n,
@@ -336,8 +343,8 @@ typedef enum {
* operand.
*/
struct OperandSpecifier {
- OperandEncoding encoding;
- OperandType type;
+ uint8_t encoding;
+ uint8_t type;
};
/*
@@ -364,7 +371,7 @@ typedef enum {
* its operands.
*/
struct InstructionSpecifier {
- ModifierType modifierType;
+ uint8_t modifierType;
uint8_t modifierBase;
struct OperandSpecifier operands[X86_MAX_OPERANDS];
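
For MODRM_SPLITREG the decode tables hold eight entries per opcode, one per
value of the ModR/M reg field. A sketch of the lookup (the generated table
layout itself differs):

#include <cstdint>

typedef uint16_t InstrUID;

// The reg field, bits [5:3] of ModR/M (the "byte divided by 8"), picks one
// of eight candidate instructions.
InstrUID lookupSplitReg(const InstrUID group[8], uint8_t modRM) {
  return group[(modRM >> 3) & 0x7];
}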
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 8f26d9f..b7ccb4c 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -30,10 +30,6 @@ using namespace llvm;
#define PRINT_ALIAS_INSTR
#include "X86GenAsmWriter.inc"
-X86ATTInstPrinter::X86ATTInstPrinter(const MCAsmInfo &MAI)
- : MCInstPrinter(MAI) {
-}
-
void X86ATTInstPrinter::printRegName(raw_ostream &OS,
unsigned RegNo) const {
OS << '%' << getRegisterName(RegNo);
@@ -45,11 +41,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
+  // Next, always print the annotation.
+ printAnnotation(OS, Annot);
+
// If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream) {
- printAnnotation(OS, Annot);
+ if (CommentStream)
EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
- }
}
StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const {
@@ -59,15 +56,39 @@ StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const {
void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
raw_ostream &O) {
switch (MI->getOperand(Op).getImm()) {
- default: assert(0 && "Invalid ssecc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
+ default: llvm_unreachable("Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
}
}
@@ -79,11 +100,21 @@ void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
- // Print this as a signed 32-bit value.
- O << (int)Op.getImm();
+ O << Op.getImm();
else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
- O << *Op.getExpr();
+    // If a symbolic branch target was added as a constant expression, then
+    // print that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+    } else {
+ // Otherwise, just print the expression.
+ O << *Op.getExpr();
+ }
}
}
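
The effect of the print_pcrel_imm change: a branch target lowered to a
constant expression now renders as a hex address instead of a decimal
value. A standalone illustration using raw_ostream:

#include "llvm/Support/raw_ostream.h"
#include <cstdint>

int main() {
  int64_t Address = 0x401000;
  llvm::outs() << "jmp\t0x";
  llvm::outs().write_hex(Address);  // prints "jmp\t0x401000", not "4198400"
  llvm::outs() << '\n';
  return 0;
}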
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 0293869..ff94301 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -1,4 +1,4 @@
-//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===//
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -22,8 +22,9 @@ class MCOperand;
class X86ATTInstPrinter : public MCInstPrinter {
public:
- X86ATTInstPrinter(const MCAsmInfo &MAI);
-
+ X86ATTInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MRI) {}
+
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 6e4b1b9..30a847f 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -76,10 +76,19 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFDmi:
case X86::VPSHUFDmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFMask(4, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFDYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
break;
+
case X86::PSHUFHWri:
case X86::VPSHUFHWri:
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -437,31 +446,31 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPSmi:
- DecodeVPERMILPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPSYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPSYmi:
- DecodeVPERMILPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPDri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPDmi:
- DecodeVPERMILPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPDYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPDYmi:
- DecodeVPERMILPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -471,7 +480,9 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::VPERM2F128rm:
case X86::VPERM2I128rm:
- DecodeVPERM2F128Mask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+    // For instruction-comment purposes, assume the 256-bit vector is v4i64.
+ DecodeVPERM2X128Mask(MVT::v4i64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
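
Replacing DecodeVPERMILPMask with the lane-aware DecodePSHUFMask works
because VPERMILP* is exactly a per-128-bit-lane PSHUF. A standalone worked
example of the decode for imm 0x1B on a v8i32: the 2-bit selectors reload in
each lane, giving the mask <3,2,1,0,7,6,5,4>.

#include <cstdio>

int main() {
  unsigned Imm = 0x1B, NumElts = 8, NumLaneElts = 4;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    unsigned Bits = Imm;                       // selectors reload per lane
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      std::printf("%u ", Bits % NumLaneElts + l);
      Bits /= NumLaneElts;
    }
  }
  std::printf("\n");                           // -> 3 2 1 0 7 6 5 4
  return 0;
}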
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h
index 6b86db4..13fdf9a 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -1,4 +1,4 @@
-//===-- X86InstComments.h - Generate verbose-asm comments for instrs ------===//
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index f9ab5ae..46a96d2 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -35,12 +35,13 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot) {
printInstruction(MI, OS);
-
+
+  // Next, always print the annotation.
+ printAnnotation(OS, Annot);
+
// If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream) {
- printAnnotation(OS, Annot);
+ if (CommentStream)
EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
- }
}
StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const {
return getInstructionName(Opcode);
@@ -49,15 +50,40 @@ StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const {
void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
raw_ostream &O) {
switch (MI->getOperand(Op).getImm()) {
- default: assert(0 && "Invalid ssecc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
+ default: llvm_unreachable("Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
}
}
@@ -70,7 +96,18 @@ void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
O << Op.getImm();
else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
- O << *Op.getExpr();
+    // If a symbolic branch target was added as a constant expression, then
+    // print that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+    } else {
+ // Otherwise, just print the expression.
+ O << *Op.getExpr();
+ }
}
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 6d5ec62..ea1d38a 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -1,4 +1,4 @@
-//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===//
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -23,8 +23,8 @@ class MCOperand;
class X86IntelInstPrinter : public MCInstPrinter {
public:
- X86IntelInstPrinter(const MCAsmInfo &MAI)
- : MCInstPrinter(MAI) {}
+ X86IntelInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MRI) {}
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index ab2ebb4..1c240e5 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -2,8 +2,10 @@ add_llvm_library(LLVMX86Desc
X86AsmBackend.cpp
X86MCTargetDesc.cpp
X86MCAsmInfo.cpp
- X86MCCodeEmitter.cpp
+ X86MCCodeEmitter.cpp
X86MachObjectWriter.cpp
+ X86ELFObjectWriter.cpp
+ X86WinCOFFObjectWriter.cpp
)
add_dependencies(LLVMX86Desc X86CommonTableGen)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 87b2b05..9ccbf1c 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -37,18 +37,22 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
static unsigned getFixupKindLog2Size(unsigned Kind) {
switch (Kind) {
- default: assert(0 && "invalid fixup kind!");
+ default: llvm_unreachable("invalid fixup kind!");
case FK_PCRel_1:
+ case FK_SecRel_1:
case FK_Data_1: return 0;
case FK_PCRel_2:
+ case FK_SecRel_2:
case FK_Data_2: return 1;
case FK_PCRel_4:
case X86::reloc_riprel_4byte:
case X86::reloc_riprel_4byte_movq_load:
case X86::reloc_signed_4byte:
case X86::reloc_global_offset_table:
+ case FK_SecRel_4:
case FK_Data_4: return 2;
case FK_PCRel_8:
+ case FK_SecRel_8:
case FK_Data_8: return 3;
}
}
@@ -57,9 +61,9 @@ namespace {
class X86ELFObjectWriter : public MCELFObjectTargetWriter {
public:
- X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine,
- bool HasRelocationAddend)
- : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {}
+ X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+ bool HasRelocationAddend, bool foobar)
+ : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
};
class X86AsmBackend : public MCAsmBackend {
@@ -87,7 +91,7 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value) const {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
@@ -105,16 +109,16 @@ public:
Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
}
- bool MayNeedRelaxation(const MCInst &Inst) const;
+ bool mayNeedRelaxation(const MCInst &Inst) const;
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
const MCInstFragment *DF,
const MCAsmLayout &Layout) const;
- void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
- bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
};
} // end anonymous namespace
@@ -219,7 +223,7 @@ static unsigned getRelaxedOpcode(unsigned Op) {
return getRelaxedOpcodeBranch(Op);
}
-bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
// Branches can always be relaxed.
if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
return true;
@@ -259,7 +263,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
// FIXME: Can tblgen help at all here to verify there aren't other instructions
// we can relax?
-void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
// The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
@@ -275,10 +279,10 @@ void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
Res.setOpcode(RelaxedOp);
}
-/// WriteNopData - Write optimal nops to the output file for the \arg Count
+/// writeNopData - Write optimal nops to the output file for the \arg Count
/// bytes. This returns the number of bytes written. It may return 0 if
/// the \arg Count is more than the maximum optimal nops.
-bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
static const uint8_t Nops[10][10] = {
// nop
{0x90},
@@ -323,9 +327,9 @@ bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
namespace {
class ELFX86AsmBackend : public X86AsmBackend {
public:
- Triple::OSType OSType;
- ELFX86AsmBackend(const Target &T, Triple::OSType _OSType)
- : X86AsmBackend(T), OSType(_OSType) {
+ uint8_t OSABI;
+ ELFX86AsmBackend(const Target &T, uint8_t _OSABI)
+ : X86AsmBackend(T), OSABI(_OSABI) {
HasReliableSymbolDifference = true;
}
@@ -337,31 +341,21 @@ public:
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType)
- : ELFX86AsmBackend(T, OSType) {}
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI)
+ : ELFX86AsmBackend(T, OSABI) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createELFObjectWriter(createELFObjectTargetWriter(),
- OS, /*IsLittleEndian*/ true);
- }
-
- MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
- return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false);
+ return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI);
}
};
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType)
- : ELFX86AsmBackend(T, OSType) {}
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI)
+ : ELFX86AsmBackend(T, OSABI) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createELFObjectWriter(createELFObjectTargetWriter(),
- OS, /*IsLittleEndian*/ true);
- }
-
- MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
- return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true);
+ return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI);
}
};
@@ -375,7 +369,7 @@ public:
}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createWinCOFFObjectWriter(OS, Is64Bit);
+ return createX86WinCOFFObjectWriter(OS, Is64Bit);
}
};
@@ -455,7 +449,8 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
if (TheTriple.isOSWindows())
return new WindowsX86AsmBackend(T, false);
- return new ELFX86_32AsmBackend(T, TheTriple.getOS());
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFX86_32AsmBackend(T, OSABI);
}
MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
@@ -467,5 +462,6 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
if (TheTriple.isOSWindows())
return new WindowsX86AsmBackend(T, true);
- return new ELFX86_64AsmBackend(T, TheTriple.getOS());
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFX86_64AsmBackend(T, OSABI);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 662ac1d..a0bb6dc 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -19,7 +19,7 @@
#include "X86MCTargetDesc.h"
#include "llvm/Support/DataTypes.h"
-#include <cassert>
+#include "llvm/Support/ErrorHandling.h"
namespace llvm {
@@ -164,7 +164,13 @@ namespace X86II {
/// is some TLS offset from the picbase.
///
/// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
- MO_TLVP_PIC_BASE
+ MO_TLVP_PIC_BASE,
+
+ /// MO_SECREL - On a symbol operand this indicates that the immediate is
+ /// the offset from beginning of section.
+ ///
+ /// This is the TLS offset for the COFF/Windows TLS mechanism.
+ MO_SECREL
};
enum {
@@ -223,19 +229,13 @@ namespace X86II {
// destinations are the same register.
MRMInitReg = 32,
- //// MRM_C1 - A mod/rm byte of exactly 0xC1.
- MRM_C1 = 33,
- MRM_C2 = 34,
- MRM_C3 = 35,
- MRM_C4 = 36,
- MRM_C8 = 37,
- MRM_C9 = 38,
- MRM_E8 = 39,
- MRM_F0 = 40,
- MRM_F8 = 41,
- MRM_F9 = 42,
- MRM_D0 = 45,
- MRM_D1 = 46,
+  /// MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
+ MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
+ MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
+ MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50,
+ MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54,
+ MRM_DF = 55,
/// RawFrmImm8 - This is used for the ENTER instruction, which has two
/// immediates, the first of which is a 16-bit immediate (specified by
@@ -426,10 +426,9 @@ namespace X86II {
/// this flag to indicate that the encoder should do the wacky 3DNow! thing.
Has3DNow0F0FOpcode = 1U << 7,
- /// XOP_W - Same bit as VEX_W. Used to indicate swapping of
- /// operand 3 and 4 to be encoded in ModRM or I8IMM. This is used
- /// for FMA4 and XOP instructions.
- XOP_W = 1U << 8,
+    /// MemOp4 - Used to indicate swapping of operands 3 and 4 to be encoded
+    /// in ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+ MemOp4 = 1U << 8,
/// XOP - Opcode prefix used by XOP instructions.
XOP = 1U << 9
@@ -451,7 +450,7 @@ namespace X86II {
/// of the specified instruction.
static inline unsigned getSizeOfImm(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
- default: assert(0 && "Unknown immediate size");
+ default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8:
case X86II::Imm8PCRel: return 1;
case X86II::Imm16:
@@ -466,7 +465,7 @@ namespace X86II {
/// TSFlags indicates that it is pc relative.
static inline unsigned isImmPCRel(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
- default: assert(0 && "Unknown immediate size");
+ default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8PCRel:
case X86II::Imm16PCRel:
case X86II::Imm32PCRel:
@@ -489,8 +488,8 @@ namespace X86II {
///
static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form");
- default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form");
+ default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
case X86II::RawFrm:
case X86II::AddRegFrm:
@@ -503,11 +502,11 @@ namespace X86II {
return 0;
case X86II::MRMSrcMem: {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
- bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
- if (HasXOP_W)
+ if (HasMemOp4)
++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
// FIXME: Maybe lea should have its own form? This is a horrible hack.
@@ -530,18 +529,17 @@ namespace X86II {
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
return FirstMemOp;
}
- case X86II::MRM_C1:
- case X86II::MRM_C2:
- case X86II::MRM_C3:
- case X86II::MRM_C4:
- case X86II::MRM_C8:
- case X86II::MRM_C9:
- case X86II::MRM_E8:
- case X86II::MRM_F0:
- case X86II::MRM_F8:
- case X86II::MRM_F9:
- case X86II::MRM_D0:
- case X86II::MRM_D1:
+ case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4:
+ case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_E8: case X86II::MRM_F0:
+ case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D4: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC:
+ case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF:
return -1;
}
}
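
The consolidated MRM_XX forms all mean "emit the opcode, then that literal
ModR/M byte"; the new MRM_D8-MRM_DF entries cover the SVM opcodes (VMMCALL,
for example, encodes as 0F 01 D9). A sketch of the MemOp4 test used above,
where the VEXShift value is an assumption mirroring this header:

#include <cstdint>

namespace sketch {
  const unsigned VEXShift = 32;         // assumed offset of the VEX flag bits
  const uint64_t MemOp4   = 1ULL << 8;  // mirrors X86II::MemOp4

  // True when operands 3 and 4 swap (FMA4/XOP): the second register source
  // travels in imm[7:4] instead of ModR/M.
  bool hasMemOp4(uint64_t TSFlags) {
    return (TSFlags >> VEXShift) & MemOp4;
  }
}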
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 0000000..5a42a80
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,224 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+ X86ELFObjectWriter(bool is64Bit, uint8_t OSABI);
+
+ virtual ~X86ELFObjectWriter();
+ protected:
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend) const;
+ };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI,
+ Is64Bit ? ELF::EM_X86_64 : ELF::EM_386,
+ /*HasRelocationAddend*/ Is64Bit) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter() {}
+
+unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) const {
+  // Determine the type of the relocation.
+
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+ unsigned Type;
+ if (is64Bit()) {
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
+ case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
+ case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+
+ case FK_PCRel_8:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC64;
+ break;
+ case X86::reloc_signed_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_riprel_4byte:
+ case FK_PCRel_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_PC32;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_X86_64_PLT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ Type = ELF::R_X86_64_GOTPCREL;
+ break;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_X86_64_GOTTPOFF;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ Type = ELF::R_X86_64_TLSGD;
+ break;
+ case MCSymbolRefExpr::VK_TLSLD:
+ Type = ELF::R_X86_64_TLSLD;
+ break;
+ }
+ break;
+ case FK_PCRel_2:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC16;
+ break;
+ case FK_PCRel_1:
+ assert(Modifier == MCSymbolRefExpr::VK_None);
+ Type = ELF::R_X86_64_PC8;
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case FK_Data_8: Type = ELF::R_X86_64_64; break;
+ case X86::reloc_signed_4byte:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_32S;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_X86_64_GOT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTPCREL:
+ Type = ELF::R_X86_64_GOTPCREL;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_X86_64_TPOFF32;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_X86_64_DTPOFF32;
+ break;
+ }
+ break;
+ case FK_Data_4:
+ Type = ELF::R_X86_64_32;
+ break;
+ case FK_Data_2: Type = ELF::R_X86_64_16; break;
+ case FK_PCRel_1:
+ case FK_Data_1: Type = ELF::R_X86_64_8; break;
+ }
+ }
+ } else {
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case X86::reloc_global_offset_table:
+ Type = ELF::R_386_GOTPC;
+ break;
+
+ case X86::reloc_signed_4byte:
+ case FK_PCRel_4:
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_PC32;
+ break;
+ case MCSymbolRefExpr::VK_PLT:
+ Type = ELF::R_386_PLT32;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+
+ case X86::reloc_global_offset_table:
+ Type = ELF::R_386_GOTPC;
+ break;
+
+      // FIXME: Should we avoid selecting reloc_signed_4byte in 32-bit mode
+ // instead?
+ case X86::reloc_signed_4byte:
+ case FK_PCRel_4:
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_32;
+ break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_386_GOT32;
+ break;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ Type = ELF::R_386_GOTOFF;
+ break;
+ case MCSymbolRefExpr::VK_TLSGD:
+ Type = ELF::R_386_TLS_GD;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_386_TLS_LE_32;
+ break;
+ case MCSymbolRefExpr::VK_INDNTPOFF:
+ Type = ELF::R_386_TLS_IE;
+ break;
+ case MCSymbolRefExpr::VK_NTPOFF:
+ Type = ELF::R_386_TLS_LE;
+ break;
+ case MCSymbolRefExpr::VK_GOTNTPOFF:
+ Type = ELF::R_386_TLS_GOTIE;
+ break;
+ case MCSymbolRefExpr::VK_TLSLDM:
+ Type = ELF::R_386_TLS_LDM;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_386_TLS_LDO_32;
+ break;
+ case MCSymbolRefExpr::VK_GOTTPOFF:
+ Type = ELF::R_386_TLS_IE_32;
+ break;
+ }
+ break;
+ case FK_Data_2: Type = ELF::R_386_16; break;
+ case FK_PCRel_1:
+ case FK_Data_1: Type = ELF::R_386_8; break;
+ }
+ }
+ }
+
+ return Type;
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW =
+ new X86ELFObjectWriter(Is64Bit, OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
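
Once the width and PC-relativity are fixed, the symbol modifier alone picks
the relocation. A condensed restatement of the common 64-bit, 4-byte
PC-relative cases from the writer above; note the real code rejects unknown
modifiers with llvm_unreachable instead of defaulting:

#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ELF.h"

static unsigned pcRel32Type(llvm::MCSymbolRefExpr::VariantKind Modifier) {
  using namespace llvm;
  switch (Modifier) {
  default:                           return ELF::R_X86_64_PC32;
  case MCSymbolRefExpr::VK_PLT:      return ELF::R_X86_64_PLT32;
  case MCSymbolRefExpr::VK_GOTPCREL: return ELF::R_X86_64_GOTPCREL;
  case MCSymbolRefExpr::VK_GOTTPOFF: return ELF::R_X86_64_GOTTPOFF;
  }
}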
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 17d242a..f2e34cb 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -1,4 +1,4 @@
-//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===//
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index eb64ad1..003a14a 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -48,6 +48,8 @@ static const char *const x86_asm_table[] = {
"{cc}", "cc",
0,0};
+void X86MCAsmInfoDarwin::anchor() { }
+
X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
bool is64Bit = T.getArch() == Triple::x86_64;
if (is64Bit)
@@ -80,6 +82,8 @@ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
: X86MCAsmInfoDarwin(Triple) {
}
+void X86ELFMCAsmInfo::anchor() { }
+
X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
if (T.getArch() == Triple::x86_64)
PointerSize = 8;
@@ -125,6 +129,8 @@ getNonexecutableStackSection(MCContext &Ctx) const {
0, SectionKind::getMetadata());
}
+void X86MCAsmInfoMicrosoft::anchor() { }
+
X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
if (Triple.getArch() == Triple::x86_64) {
GlobalPrefix = "";
@@ -137,6 +143,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
TextAlignFillValue = 0x90;
}
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
if (Triple.getArch() == Triple::x86_64) {
GlobalPrefix = "";
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index 5d619e8..b6b70fd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -1,4 +1,4 @@
-//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====//
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,7 +21,9 @@
namespace llvm {
class Triple;
- struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+ public:
explicit X86MCAsmInfoDarwin(const Triple &Triple);
};
@@ -33,16 +35,22 @@ namespace llvm {
MCStreamer &Streamer) const;
};
- struct X86ELFMCAsmInfo : public MCAsmInfo {
+ class X86ELFMCAsmInfo : public MCAsmInfo {
+ virtual void anchor();
+ public:
explicit X86ELFMCAsmInfo(const Triple &Triple);
virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
};
- struct X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ virtual void anchor();
+ public:
explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
};
- struct X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ virtual void anchor();
+ public:
explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
};
} // namespace llvm
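
The anchor() additions follow the usual LLVM pattern for -Wweak-vtables:
defining one virtual method out of line pins the vtable (and RTTI) to a
single object file instead of emitting weak copies in every translation
unit that uses the class. The generic shape:

// In the header: the first virtual method is declared, never defined inline.
class Base {
  virtual void anchor();
public:
  virtual ~Base() {}
};

// In exactly one .cpp file: the empty definition anchors the vtable there.
void Base::anchor() {}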
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8e14cb1..37727b6 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===//
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -85,7 +85,7 @@ public:
}
}
- void EmitImmediate(const MCOperand &Disp,
+ void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
unsigned ImmSize, MCFixupKind FixupKind,
unsigned &CurByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -202,8 +202,8 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) {
}
void X86MCCodeEmitter::
-EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
- unsigned &CurByte, raw_ostream &OS,
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+ MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
const MCExpr *Expr = NULL;
if (DispOp.isImm()) {
@@ -222,6 +222,7 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
// If we have an immoffset, add it to the expression.
if ((FixupKind == FK_Data_4 ||
+ FixupKind == FK_Data_8 ||
FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
if (Kind != GOT_None) {
@@ -230,6 +231,11 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
FixupKind = MCFixupKind(X86::reloc_global_offset_table);
if (Kind == GOT_Normal)
ImmOffset = CurByte;
+ } else if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL) {
+ FixupKind = MCFixupKind(FK_SecRel_4);
+ }
}
}
@@ -249,7 +255,7 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
Ctx);
// Emit a symbolic constant as a fixup and 4 zeros.
- Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
+ Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc));
EmitConstant(0, Size, CurByte, OS);
}
@@ -285,7 +291,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// expression to emit.
int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
- EmitImmediate(Disp, 4, MCFixupKind(FixupKind),
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
return;
}
@@ -309,7 +315,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
if (BaseReg == 0) { // [disp32] in X86-32 mode
EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
- EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
return;
}
@@ -325,13 +331,13 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (Disp.isImm() && isDisp8(Disp.getImm())) {
EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
return;
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
Fixups);
return;
}
@@ -390,10 +396,10 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Do we need to output a displacement?
if (ForceDisp8)
- EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
else if (ForceDisp32 || Disp.getImm() != 0)
- EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
- Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
}
/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
@@ -431,10 +437,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// opcode extension, or ignored, depending on the opcode byte)
unsigned char VEX_W = 0;
- // XOP_W: opcode specific, same bit as VEX_W, but used to
- // swap operand 3 and 4 for FMA4 and XOP instructions
- unsigned char XOP_W = 0;
-
// XOP: Use XOP prefix byte 0x8f instead of VEX.
unsigned char XOP = 0;
@@ -477,9 +479,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
VEX_W = 1;
- if ((TSFlags >> X86II::VEXShift) & X86II::XOP_W)
- XOP_W = 1;
-
if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
XOP = 1;
@@ -487,7 +486,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_L = 1;
switch (TSFlags & X86II::Op0Mask) {
- default: assert(0 && "Invalid prefix!");
+ default: llvm_unreachable("Invalid prefix!");
case X86II::T8: // 0F 38
VEX_5M = 0x2;
break;
@@ -538,7 +537,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned CurOp = 0;
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
case X86II::MRMDestMem: {
// MRMDestMem instructions forms:
// MemAddr, src1(ModR/M)
@@ -669,7 +668,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 3 byte VEX prefix
EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- EmitByte(LastByte | ((VEX_W | XOP_W) << 7), CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
}
/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
@@ -702,7 +701,7 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+ case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
case X86II::MRMSrcReg:
if (MI.getOperand(0).isReg() &&
X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
@@ -772,12 +771,12 @@ void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
const MCInst &MI,
raw_ostream &OS) const {
switch (TSFlags & X86II::SegOvrMask) {
- default: assert(0 && "Invalid segment!");
+ default: llvm_unreachable("Invalid segment!");
case 0:
// No segment override, check for explicit one on memory operand.
if (MemOperand != -1) { // If the instruction has a memory operand.
switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
- default: assert(0 && "Unknown segment register!");
+ default: llvm_unreachable("Unknown segment register!");
case 0: break;
case X86::CS: EmitByte(0x2E, CurByte, OS); break;
case X86::SS: EmitByte(0x36, CurByte, OS); break;
@@ -828,7 +827,7 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
bool Need0FPrefix = false;
switch (TSFlags & X86II::Op0Mask) {
- default: assert(0 && "Invalid prefix!");
+ default: llvm_unreachable("Invalid prefix!");
case 0: break; // No prefix!
case X86II::REP: break; // already handled.
case X86II::TB: // Two-byte opcode prefix
@@ -929,8 +928,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// It uses the VEX.VVVV field?
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
- bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W;
- unsigned XOP_W_I8IMMOperand = 2;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
@@ -949,27 +948,29 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned SrcRegNum = 0;
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg:
- assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+ llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
- assert(0 && "Unknown FormMask value in X86MCCodeEmitter!");
+ llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
case X86II::Pseudo:
- assert(0 && "Pseudo instruction shouldn't be emitted");
+ llvm_unreachable("Pseudo instruction shouldn't be emitted");
case X86II::RawFrm:
EmitByte(BaseOpcode, CurByte, OS);
break;
case X86II::RawFrmImm8:
EmitByte(BaseOpcode, CurByte, OS);
- EmitImmediate(MI.getOperand(CurOp++),
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
- EmitImmediate(MI.getOperand(CurOp++), 1, FK_Data_1, CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ OS, Fixups);
break;
case X86II::RawFrmImm16:
EmitByte(BaseOpcode, CurByte, OS);
- EmitImmediate(MI.getOperand(CurOp++),
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
- EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ OS, Fixups);
break;
case X86II::AddRegFrm:
@@ -1003,14 +1004,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
SrcRegNum++;
- if(HasXOP_W) // Skip 2nd src (which is encoded in I8IMM)
+  if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
SrcRegNum++;
EmitRegModRMByte(MI.getOperand(SrcRegNum),
GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
- // 2 operands skipped with HasXOP_W, comensate accordingly
- CurOp = HasXOP_W ? SrcRegNum : SrcRegNum + 1;
+    // 2 operands skipped with HasMemOp4, compensate accordingly
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
if (HasVEX_4VOp3)
++CurOp;
break;
@@ -1022,7 +1023,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
++AddrOperands;
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
}
- if(HasXOP_W) // Skip second register source (encoded in I8IMM)
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
++FirstMemOp;
EmitByte(BaseOpcode, CurByte, OS);
@@ -1057,53 +1058,45 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
TSFlags, CurByte, OS, Fixups);
CurOp += X86::AddrNumOperands;
break;
- case X86II::MRM_C1:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC1, CurByte, OS);
- break;
- case X86II::MRM_C2:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC2, CurByte, OS);
- break;
- case X86II::MRM_C3:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC3, CurByte, OS);
- break;
- case X86II::MRM_C4:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC4, CurByte, OS);
- break;
- case X86II::MRM_C8:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC8, CurByte, OS);
- break;
- case X86II::MRM_C9:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC9, CurByte, OS);
- break;
- case X86II::MRM_E8:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xE8, CurByte, OS);
- break;
- case X86II::MRM_F0:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xF0, CurByte, OS);
- break;
- case X86II::MRM_F8:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xF8, CurByte, OS);
- break;
+ case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4:
+ case X86II::MRM_C8: case X86II::MRM_C9:
+ case X86II::MRM_D0: case X86II::MRM_D1:
+ case X86II::MRM_D4: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC:
+ case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF: case X86II::MRM_E8:
+ case X86II::MRM_F0: case X86II::MRM_F8:
case X86II::MRM_F9:
EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xF9, CurByte, OS);
- break;
- case X86II::MRM_D0:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xD0, CurByte, OS);
- break;
- case X86II::MRM_D1:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xD1, CurByte, OS);
+
+ unsigned char MRM;
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Invalid Form");
+ case X86II::MRM_C1: MRM = 0xC1; break;
+ case X86II::MRM_C2: MRM = 0xC2; break;
+ case X86II::MRM_C3: MRM = 0xC3; break;
+ case X86II::MRM_C4: MRM = 0xC4; break;
+ case X86II::MRM_C8: MRM = 0xC8; break;
+ case X86II::MRM_C9: MRM = 0xC9; break;
+ case X86II::MRM_D0: MRM = 0xD0; break;
+ case X86II::MRM_D1: MRM = 0xD1; break;
+ case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D8: MRM = 0xD8; break;
+ case X86II::MRM_D9: MRM = 0xD9; break;
+ case X86II::MRM_DA: MRM = 0xDA; break;
+ case X86II::MRM_DB: MRM = 0xDB; break;
+ case X86II::MRM_DC: MRM = 0xDC; break;
+ case X86II::MRM_DD: MRM = 0xDD; break;
+ case X86II::MRM_DE: MRM = 0xDE; break;
+ case X86II::MRM_DF: MRM = 0xDF; break;
+ case X86II::MRM_E8: MRM = 0xE8; break;
+ case X86II::MRM_F0: MRM = 0xF0; break;
+ case X86II::MRM_F8: MRM = 0xF8; break;
+ case X86II::MRM_F9: MRM = 0xF9; break;
+ }
+ EmitByte(MRM, CurByte, OS);
break;
}
@@ -1113,7 +1106,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// The last source register of a 4 operand instruction in AVX is encoded
// in bits[7:4] of a immediate byte.
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
- const MCOperand &MO = MI.getOperand(HasXOP_W ? XOP_W_I8IMMOperand
+ const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
: CurOp);
CurOp++;
bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg());
@@ -1129,8 +1122,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
RegNum |= Val;
}
}
- EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS,
- Fixups);
+ EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+ CurByte, OS, Fixups);
} else {
unsigned FixupKind;
// FIXME: Is there a better way to know that we need a signed relocation?
@@ -1141,7 +1134,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
FixupKind = X86::reloc_signed_4byte;
else
FixupKind = getImmFixupKind(TSFlags);
- EmitImmediate(MI.getOperand(CurOp++),
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
CurByte, OS, Fixups);
}
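
The VK_SECREL handling threaded through EmitImmediate is what makes Windows
TLS work end to end: an @SECREL symbol reference downgrades a plain 4-byte
data fixup to FK_SecRel_4, which the COFF writer below maps to
IMAGE_REL_*_SECREL. A sketch of that classification step in isolation:

#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/Support/Casting.h"

static llvm::MCFixupKind classifyImmFixup(const llvm::MCExpr *Expr,
                                          llvm::MCFixupKind Kind) {
  using namespace llvm;
  if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Expr))
    if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL)
      return MCFixupKind(FK_SecRel_4);  // section-relative, not absolute
  return Kind;
}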
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index f2a34ed..efd18c7 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -1,4 +1,4 @@
-//===-- X86MCTargetDesc.cpp - X86 Target Descriptions -----------*- C++ -*-===//
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -24,6 +24,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_REGINFO_MC_DESC
@@ -35,6 +36,10 @@
#define GET_SUBTARGETINFO_MC_DESC
#include "X86GenSubtargetInfo.inc"
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
using namespace llvm;
@@ -72,6 +77,8 @@ bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
*rECX = registers[2];
*rEDX = registers[3];
return false;
+ #else
+ return true;
#endif
#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
#if defined(__GNUC__)
@@ -98,9 +105,12 @@ bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
mov dword ptr [esi],edx
}
return false;
+ #else
+ return true;
#endif
-#endif
+#else
return true;
+#endif
}
/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
@@ -131,7 +141,11 @@ bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
*rECX = registers[2];
*rEDX = registers[3];
return false;
+ #else
+ return true;
#endif
+ #else
+ return true;
#endif
#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
#if defined(__GNUC__)
@@ -160,9 +174,12 @@ bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
mov dword ptr [esi],edx
}
return false;
+ #else
+ return true;
#endif
-#endif
+#else
return true;
+#endif
}
void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
@@ -319,7 +336,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
std::string CPUName = CPU;
if (CPUName.empty()) {
-#if defined (__x86_64__) || defined(__i386__)
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
CPUName = sys::getHostCPUName();
#else
CPUName = "generic";
@@ -456,11 +474,12 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
static MCInstPrinter *createX86MCInstPrinter(const Target &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI) {
if (SyntaxVariant == 0)
- return new X86ATTInstPrinter(MAI);
+ return new X86ATTInstPrinter(MAI, MRI);
if (SyntaxVariant == 1)
- return new X86IntelInstPrinter(MAI);
+ return new X86IntelInstPrinter(MAI, MRI);
return 0;
}
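
Note the inverted convention: GetCpuIDAndInfo returns true on *failure*, so
the new #else branches give a clean early-out on non-x86 hosts and unknown
compilers. A typical call-site sketch:

#include "MCTargetDesc/X86MCTargetDesc.h"

static void probeHostCPU() {
  using namespace llvm;
  unsigned EAX, EBX, ECX, EDX, Family = 0, Model = 0;
  if (X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
    return;                             // cpuid not available on this host
  X86_MC::DetectFamilyModel(EAX, Family, Model);
}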
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index a4e0b5a..9896cbe 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -88,6 +88,12 @@ MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
uint32_t CPUType,
uint32_t CPUSubtype);
+/// createX86ELFObjectWriter - Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint8_t OSABI);
+/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
} // End llvm namespace
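
A sketch of how a caller chooses between the two new factories, mirroring
the AsmBackend changes earlier in this patch (helper name invented):

#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/Support/raw_ostream.h"

static llvm::MCObjectWriter *makeX86ObjectWriter(llvm::raw_ostream &OS,
                                                 const llvm::Triple &T) {
  using namespace llvm;
  bool Is64Bit = T.getArch() == Triple::x86_64;
  if (T.isOSWindows())
    return createX86WinCOFFObjectWriter(OS, Is64Bit);
  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(T.getOS());
  return createX86ELFObjectWriter(OS, Is64Bit, OSABI);
}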
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..bc272ef
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,65 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+ class MCObjectWriter;
+}
+
+namespace {
+ class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+ const bool Is64Bit;
+
+ public:
+ X86WinCOFFObjectWriter(bool Is64Bit_);
+ ~X86WinCOFFObjectWriter();
+
+ virtual unsigned getRelocType(unsigned FixupKind) const;
+ };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
+ : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 :
+ COFF::IMAGE_FILE_MACHINE_I386),
+ Is64Bit(Is64Bit_) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const {
+ switch (FixupKind) {
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
+ case FK_Data_4:
+ case X86::reloc_signed_4byte:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
+ case FK_Data_8:
+ if (Is64Bit)
+ return COFF::IMAGE_REL_AMD64_ADDR64;
+ llvm_unreachable("unsupported relocation type");
+ case FK_SecRel_4:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
+ default:
+ llvm_unreachable("unsupported relocation type");
+ }
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
+ bool Is64Bit) {
+ MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index b407955..f9c1d35 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -56,7 +56,7 @@ cmovs, we should expand to a conditional branch like GCC produces.
Some isel ideas:
-1. Dynamic programming based approach when compile time if not an
+1. Dynamic programming based approach when compile time is not an
issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
@@ -2060,35 +2060,3 @@ Instead we could generate:
The trick is to match "fetch_and_add(X, -C) == C".
//===---------------------------------------------------------------------===//
-
-unsigned log2(unsigned x) {
- return x > 1 ? 32-__builtin_clz(x-1) : 0;
-}
-
-generates (x86_64):
- xorl %eax, %eax
- cmpl $2, %edi
- jb LBB0_2
-## BB#1:
- decl %edi
- movl $63, %ecx
- bsrl %edi, %eax
- cmovel %ecx, %eax
- xorl $-32, %eax
- addl $33, %eax
-LBB0_2:
- ret
-
-The cmov and the early test are redundant:
- xorl %eax, %eax
- cmpl $2, %edi
- jb LBB0_2
-## BB#1:
- decl %edi
- bsrl %edi, %eax
- xorl $-32, %eax
- addl $33, %eax
-LBB0_2:
- ret
-
-//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index e7631b6..f4b85ae 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -63,11 +63,23 @@ void DecodeMOVLHPSMask(unsigned NElts,
ShuffleMask.push_back(NElts+i);
}
-void DecodePSHUFMask(unsigned NElts, unsigned Imm,
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
- for (unsigned i = 0; i != NElts; ++i) {
- ShuffleMask.push_back(Imm % NElts);
- Imm /= NElts;
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ int NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
+ NewImm /= NumLaneElts;
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
}
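
The lane handling is easiest to see with a concrete immediate. A runnable sketch of the loop above, decoding a hypothetical vpermilps of a v8i32 with Imm = 0x1B (types and values chosen purely for illustration):

#include <cstdio>
#include <vector>

int main() {
  unsigned Imm = 0x1B, NumElts = 8, NumLaneElts = 4; // v8i32: two 128-bit lanes
  std::vector<unsigned> Mask;
  unsigned NewImm = Imm;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      Mask.push_back(NewImm % NumLaneElts + l);
      NewImm /= NumLaneElts;
    }
    if (NumLaneElts == 4) NewImm = Imm; // reload imm for the next lane
  }
  for (size_t i = 0; i != Mask.size(); ++i) printf("%u ", Mask[i]);
  printf("\n"); // prints: 3 2 1 0 7 6 5 4 -- each lane shuffled the same way
  return 0;
}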
@@ -95,6 +107,9 @@ void DecodePSHUFLWMask(unsigned Imm,
ShuffleMask.push_back(7);
}
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
void DecodeSHUFPMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
@@ -103,22 +118,24 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm,
unsigned NumLaneElts = NumElts / NumLanes;
int NewImm = Imm;
- for (unsigned l = 0; l < NumLanes; ++l) {
- unsigned LaneStart = l * NumLaneElts;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
// Part that reads from dest.
for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + LaneStart);
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
NewImm /= NumLaneElts;
}
// Part that reads from src.
for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + LaneStart);
+ ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l);
NewImm /= NumLaneElts;
}
if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
}
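
The dest/src split above is easiest to check on a single-lane case. A standalone sketch decoding shufps with Imm = 0x1B on v4f32, where mask values 0-3 read the first source and 4-7 the second:

#include <cstdio>
#include <vector>

int main() {
  unsigned Imm = 0x1B, NumElts = 4, NumLaneElts = 4, NewImm = Imm;
  std::vector<unsigned> Mask;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts/2; ++i) { // reads from dest
      Mask.push_back(NewImm % NumLaneElts + l);
      NewImm /= NumLaneElts;
    }
    for (unsigned i = 0; i != NumLaneElts/2; ++i) { // reads from src
      Mask.push_back(NewImm % NumLaneElts + NumElts + l);
      NewImm /= NumLaneElts;
    }
  }
  for (size_t i = 0; i != Mask.size(); ++i) printf("%u ", Mask[i]);
  printf("\n"); // prints: 3 2 5 4
  return 0;
}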
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
@@ -128,10 +145,8 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) {
if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
- for (unsigned s = 0; s < NumLanes; ++s) {
- unsigned Start = s * NumLaneElts + NumLaneElts/2;
- unsigned End = s * NumLaneElts + NumLaneElts;
- for (unsigned i = Start; i != End; ++i) {
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
ShuffleMask.push_back(i); // Reads from dest/src1
ShuffleMask.push_back(i+NumElts); // Reads from src/src2
}
@@ -139,8 +154,8 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) {
}
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// etc. VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
@@ -150,38 +165,15 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) {
if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
- for (unsigned s = 0; s < NumLanes; ++s) {
- unsigned Start = s * NumLaneElts;
- unsigned End = s * NumLaneElts + NumLaneElts/2;
- for (unsigned i = Start; i != End; ++i) {
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
ShuffleMask.push_back(i); // Reads from dest/src1
ShuffleMask.push_back(i+NumElts); // Reads from src/src2
}
}
}
-// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit
-// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128
-// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of
-// the first lane must be the same of the second.
-void DecodeVPERMILPMask(EVT VT, unsigned Imm,
- SmallVectorImpl<unsigned> &ShuffleMask) {
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElts / NumLanes;
-
- for (unsigned l = 0; l != NumLanes; ++l) {
- unsigned LaneStart = l*NumLaneElts;
- for (unsigned i = 0; i != NumLaneElts; ++i) {
- unsigned Idx = NumLaneElts == 4 ? (Imm >> (i*2)) & 0x3
- : (Imm >> (i+LaneStart)) & 0x1;
- ShuffleMask.push_back(Idx+LaneStart);
- }
- }
-}
-
-void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned HalfSize = VT.getVectorNumElements()/2;
unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;
@@ -193,12 +185,4 @@ void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
ShuffleMask.push_back(i);
}
-void DecodeVPERM2F128Mask(unsigned Imm,
- SmallVectorImpl<unsigned> &ShuffleMask) {
- // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only
- // has information about the instruction and not the types. So for
- // instruction comments purpose, assume the 256-bit vector is v4i64.
- return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask);
-}
-
} // llvm namespace
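
For the renamed DecodeVPERM2X128Mask, only the first-half computation is visible in the hunk; assuming the second destination half mirrors it with bits 5:4 of the immediate (as the surrounding code suggests), a worked example for v4i64 with Imm = 0x31:

#include <cstdio>

int main() {
  unsigned Imm = 0x31, NumElts = 4, HalfSize = NumElts / 2;
  unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;        // 1 * 2 = 2
  unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; // 3 * 2 = 6
  for (unsigned i = FstHalfBegin; i != FstHalfBegin + HalfSize; ++i)
    printf("%u ", i);
  for (unsigned i = SndHalfBegin; i != SndHalfBegin + HalfSize; ++i)
    printf("%u ", i);
  printf("\n"); // prints: 2 3 6 7 -- the high halves of src1 and src2
  return 0;
}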
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 243728f..877c9bd 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -37,7 +37,7 @@ void DecodeMOVHLPSMask(unsigned NElts,
void DecodeMOVLHPSMask(unsigned NElts,
SmallVectorImpl<unsigned> &ShuffleMask);
-void DecodePSHUFMask(unsigned NElts, unsigned Imm,
+void DecodePSHUFMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
void DecodePSHUFHWMask(unsigned Imm,
@@ -46,30 +46,24 @@ void DecodePSHUFHWMask(unsigned Imm,
void DecodePSHUFLWMask(unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
void DecodeSHUFPMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// etc. VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask);
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// etc. VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask);
-// DecodeVPERMILPMask - Decodes VPERMILPS/ VPERMILPD permutes for any 128-bit
-// 32-bit or 64-bit elements. For 256-bit vectors, it's considered as two 128
-// lanes. For VPERMILPS, referenced elements can't cross lanes and the mask of
-// the first lane must be the same of the second.
-void DecodeVPERMILPMask(EVT VT, unsigned Imm,
- SmallVectorImpl<unsigned> &ShuffleMask);
-
-void DecodeVPERM2F128Mask(unsigned Imm,
- SmallVectorImpl<unsigned> &ShuffleMask);
-void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
} // llvm namespace
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 8229ca5..b6591d4 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -1,4 +1,4 @@
-//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -55,7 +55,7 @@ def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
[FeatureSSSE3]>;
def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
"Enable SSE 4.2 instructions",
- [FeatureSSE41, FeaturePOPCNT]>;
+ [FeatureSSE41]>;
def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
"Enable 3DNow! instructions",
[FeatureMMX]>;
@@ -78,20 +78,23 @@ def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
"Fast unaligned memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
- [FeaturePOPCNT]>;
+ [FeatureSSE3]>;
-def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true",
- "Enable AVX instructions">;
-def FeatureAVX2 : SubtargetFeature<"avx2", "HasAVX2", "true",
+def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+ "Enable AVX instructions",
+ [FeatureSSE42]>;
+def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true",
"Enable carry-less multiplication instructions">;
def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true",
- "Enable three-operand fused multiple-add">;
+ "Enable three-operand fused multiple-add",
+ [FeatureAVX]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
- "Enable four-operand fused multiple-add">;
-def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
+ "Enable four-operand fused multiple-add",
+ [FeatureAVX]>;
+def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
"Enable XOP instructions">;
def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
"HasVectorUAMem", "true",
@@ -112,13 +115,23 @@ def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
"Support BMI instructions">;
def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
+def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+ "Intel Atom processors">;
+
class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
+ : Processor<Name, GenericItineraries, Features>;
+
+class AtomProc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, AtomItineraries, Features>;
def : Proc<"generic", []>;
def : Proc<"i386", []>;
@@ -143,35 +156,38 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : Proc<"atom", [FeatureSSE3, FeatureCMPXCHG16B, FeatureMOVBE,
- FeatureSlowBTMem]>;
+def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
// "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>;
+ FeatureSlowBTMem, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES]>;
def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem]>;
+ FeatureSlowBTMem, FeatureFastUAMem,
+ FeaturePOPCNT]>;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureFastUAMem, FeatureAES,
- FeatureCLMUL]>;
+ FeatureSlowBTMem, FeatureFastUAMem,
+ FeaturePOPCNT, FeatureAES, FeatureCLMUL]>;
// Sandy Bridge
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
// FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B,
+def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
FeatureAES, FeatureCLMUL]>;
// Ivy Bridge
-def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B,
+def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
FeatureAES, FeatureCLMUL,
FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>;
// Haswell
-// FIXME: Disabling AVX/AVX2 for now since it's not ready.
-def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeatureAES,
- FeatureCLMUL, FeatureRDRAND, FeatureF16C,
- FeatureFSGSBase, FeatureFMA3, FeatureMOVBE,
- FeatureLZCNT, FeatureBMI, FeatureBMI2]>;
+// FIXME: Disabling AVX/AVX2/FMA3 for now since they're not ready.
+def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeatureCLMUL, FeatureRDRAND,
+ FeatureF16C, FeatureFSGSBase,
+ FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+ FeatureBMI2]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -197,15 +213,20 @@ def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeatureSlowBTMem]>;
-// FIXME: Disabling AVX for now since it's not ready.
+ FeaturePOPCNT, FeatureSlowBTMem]>;
+// Bobcat
+def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
+ FeatureLZCNT, FeaturePOPCNT]>;
+// FIXME: Disabling AVX/FMA4 for now since they're not ready.
+// Bulldozer
def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL, FeatureFMA4,
- FeatureXOP, FeatureLZCNT]>;
+ FeatureAES, FeatureCLMUL,
+ FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>;
+// Enhanced Bulldozer
def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL, FeatureFMA4,
- FeatureXOP, FeatureF16C, FeatureLZCNT,
- FeatureBMI]>;
+ FeatureAES, FeatureCLMUL,
+ FeatureXOP, FeatureF16C, FeatureLZCNT,
+ FeaturePOPCNT, FeatureBMI]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
def : Proc<"winchip2", [Feature3DNow]>;
@@ -237,9 +258,11 @@ include "X86CallingConv.td"
// Assembly Parser
//===----------------------------------------------------------------------===//
-// Currently the X86 assembly parser only supports ATT syntax.
def ATTAsmParser : AsmParser {
- string AsmParserClassName = "ATTAsmParser";
+ string AsmParserClassName = "AsmParser";
+}
+
+def ATTAsmParserVariant : AsmParserVariant {
int Variant = 0;
// Discard comments in assembly strings.
@@ -249,6 +272,16 @@ def ATTAsmParser : AsmParser {
string RegisterPrefix = "%";
}
+def IntelAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = ";";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "";
+}
+
//===----------------------------------------------------------------------===//
// Assembly Printers
//===----------------------------------------------------------------------===//
@@ -269,8 +302,7 @@ def IntelAsmWriter : AsmWriter {
def X86 : Target {
// Information about the instructions...
let InstructionSet = X86InstrInfo;
-
let AssemblyParsers = [ATTAsmParser];
-
+ let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
}
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 4c3ff02..268cbf4 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -199,6 +199,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
case X86II::MO_TLVP_PIC_BASE:
O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
break;
+ case X86II::MO_SECREL: O << "@SECREL"; break;
}
}
@@ -266,14 +267,38 @@ void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op,
unsigned char value = MI->getOperand(Op).getImm();
assert(value <= 7 && "Invalid ssecc argument!");
switch (value) {
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
+ case 0x10: O << "eq_os"; break;
+ case 0x11: O << "lt_oq"; break;
+ case 0x12: O << "le_oq"; break;
+ case 0x13: O << "unord_s"; break;
+ case 0x14: O << "neq_us"; break;
+ case 0x15: O << "nlt_uq"; break;
+ case 0x16: O << "nle_uq"; break;
+ case 0x17: O << "ord_s"; break;
+ case 0x18: O << "eq_us"; break;
+ case 0x19: O << "nge_uq"; break;
+ case 0x1a: O << "ngt_uq"; break;
+ case 0x1b: O << "false_os"; break;
+ case 0x1c: O << "neq_os"; break;
+ case 0x1d: O << "ge_oq"; break;
+ case 0x1e: O << "gt_oq"; break;
+ case 0x1f: O << "true_us"; break;
}
}
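
The new predicate names track the AVX _CMP_* immediates from immintrin.h; e.g. 0x1d is _CMP_GE_OQ, printed as "ge_oq" above. A minimal user-level illustration (requires -mavx; not part of the patch):

#include <immintrin.h>

__m128 cmp_ge_oq(__m128 a, __m128 b) {
  // Assembles to vcmpps $0x1d, which the printer renders as vcmpge_oqps.
  return _mm_cmp_ps(a, b, _CMP_GE_OQ);
}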
@@ -575,7 +600,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() &&
- MMI->callsExternalVAFunctionWithFloatingPointArguments()) {
+ MMI->usesVAFloatArgument()) {
StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
index 4326814..e01ff41 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/X86COFFMachineModuleInfo.cpp -------------------------===//
+//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
index 98ab2a6..63c08f1 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/X86COFFMachineModuleInfo.h -----------------*- C++ -*-===//
+//===-- X86COFFMachineModuleInfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index aab2a05..d148989 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -1,10 +1,10 @@
-//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===//
-//
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This describes the calling conventions for the X86-32 and X86-64
@@ -61,7 +61,7 @@ def RetCC_X86_32_C : CallingConv<[
// weirdly; this is really the sse-regparm calling convention) in which
// case they use XMM0, otherwise it is the same as the common X86 calling
// conv.
- CCIfInReg<CCIfSubtarget<"hasXMMInt()",
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
CCDelegateTo<RetCC_X86Common>
@@ -73,8 +73,8 @@ def RetCC_X86_32_Fast : CallingConv<[
// SSE2.
// This can happen when a float, 2 x float, or 3 x float vector is split by
// target lowering, and is returned in 1-3 sse regs.
- CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
- CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
// For integers, ECX can be used as an extra return register
CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
@@ -150,12 +150,12 @@ def CC_X86_64_C : CallingConv<[
// The first 8 MMX vector arguments are passed in XMM registers on Darwin.
CCIfType<[x86mmx],
CCIfSubtarget<"isTargetDarwin()",
- CCIfSubtarget<"hasXMMInt()",
+ CCIfSubtarget<"hasSSE2()",
CCPromoteToType<v2i64>>>>,
// The first 8 FP/Vector arguments are passed in XMM registers.
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfSubtarget<"hasXMM()",
+ CCIfSubtarget<"hasSSE1()",
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
// The first 8 256-bit vector arguments are passed in YMM registers, unless
@@ -198,6 +198,10 @@ def CC_X86_Win64_C : CallingConv<[
// 128 bit vectors are passed by pointer
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+ // 256 bit vectors are passed by pointer
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
@@ -238,7 +242,7 @@ def CC_X86_64_GHC : CallingConv<[
// Pass in STG registers: F1, F2, F3, F4, D1, D2
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfSubtarget<"hasXMM()",
+ CCIfSubtarget<"hasSSE1()",
CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
]>;
@@ -256,7 +260,7 @@ def CC_X86_32_Common : CallingConv<[
// The first 3 float or double arguments, if marked 'inreg' and if the call
// is not a vararg call and if SSE2 is available, are passed in SSE registers.
CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
- CCIfSubtarget<"hasXMMInt()",
+ CCIfSubtarget<"hasSSE2()",
CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
// The first 3 __m64 vector arguments are passed in mmx registers if the
@@ -327,8 +331,8 @@ def CC_X86_32_ThisCall : CallingConv<[
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- // The 'nest' parameter, if any, is passed in EAX.
- CCIfNest<CCAssignToReg<[EAX]>>,
+ // Pass sret arguments indirectly through EAX
+ CCIfSRet<CCAssignToReg<[EAX]>>,
// The first integer argument is passed in ECX
CCIfType<[i32], CCAssignToReg<[ECX]>>,
@@ -355,7 +359,7 @@ def CC_X86_32_FastCC : CallingConv<[
// The first 3 float or double arguments, if the call is not a vararg
// call and if SSE2 is available, are passed in SSE registers.
CCIfNotVarArg<CCIfType<[f32,f64],
- CCIfSubtarget<"hasXMMInt()",
+ CCIfSubtarget<"hasSSE2()",
CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
// Doubles get 8-byte slots that are 8-byte aligned.
@@ -404,3 +408,18 @@ def CC_X86 : CallingConv<[
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_Ghc : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+ (sequence "XMM%u", 6, 15))>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index ed16e88..ee3de9a 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -806,8 +806,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
}
assert(MO.isImm() && "Unknown RawFrm operand!");
- if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32 ||
- Opcode == X86::WINCALL64pcrel32) {
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
// Fix up immediate operand for pc relative calls.
intptr_t Imm = (intptr_t)MO.getImm();
Imm = Imm - MCE.getCurrentPCValue() - 4;
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
index 4a72d15..c1a49a7 100644
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -60,7 +60,6 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
llvm_unreachable("unknown x86 machine relocation type");
}
}
- return 0;
}
long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
@@ -83,7 +82,6 @@ long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
llvm_unreachable("unknown x86 relocation type");
}
}
- return 0;
}
unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
@@ -107,7 +105,6 @@ unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
llvm_unreachable("unknown x86 relocation type");
}
}
- return 0;
}
bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
@@ -132,7 +129,6 @@ bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
llvm_unreachable("unknown x86 relocation type");
}
}
- return 0;
}
unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
@@ -146,8 +142,6 @@ long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32)
return SymOffset - (RelOffset + 4);
- else
- assert(0 && "computeRelocation unknown for this relocation type");
- return 0;
+ llvm_unreachable("computeRelocation unknown for this relocation type");
}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 1589439..f90764e 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -60,8 +60,8 @@ public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
- X86ScalarSSEf64 = Subtarget->hasSSE2() || Subtarget->hasAVX();
- X86ScalarSSEf32 = Subtarget->hasSSE1() || Subtarget->hasAVX();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
}
virtual bool TargetSelectInstruction(const Instruction *I);
@@ -837,8 +837,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
bool HasAVX = Subtarget->hasAVX();
- bool X86ScalarSSEf32 = HasAVX || Subtarget->hasSSE1();
- bool X86ScalarSSEf64 = HasAVX || Subtarget->hasSSE2();
+ bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+ bool X86ScalarSSEf64 = Subtarget->hasSSE2();
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
@@ -1576,10 +1576,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
SmallVector<unsigned, 8> Args;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
- Args.reserve(CS.arg_size());
- ArgVals.reserve(CS.arg_size());
- ArgVTs.reserve(CS.arg_size());
- ArgFlags.reserve(CS.arg_size());
+ unsigned arg_size = CS.arg_size();
+ Args.reserve(arg_size);
+ ArgVals.reserve(arg_size);
+ ArgVTs.reserve(arg_size);
+ ArgFlags.reserve(arg_size);
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
// If we're lowering a mem intrinsic instead of a regular call, skip the
@@ -1792,9 +1793,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc;
- if (Subtarget->isTargetWin64())
- CallOpc = X86::WINCALL64r;
- else if (Subtarget->is64Bit())
+ if (Subtarget->is64Bit())
CallOpc = X86::CALL64r;
else
CallOpc = X86::CALL32r;
@@ -1805,9 +1804,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Direct call.
assert(GV && "Not a direct call");
unsigned CallOpc;
- if (Subtarget->isTargetWin64())
- CallOpc = X86::WINCALL64pcrel32;
- else if (Subtarget->is64Bit())
+ if (Subtarget->is64Bit())
CallOpc = X86::CALL64pcrel32;
else
CallOpc = X86::CALLpcrel32;
@@ -1852,10 +1849,15 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
MIB.addReg(RegArgs[i]);
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
unsigned NumBytesCallee = 0;
- if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet))
+ if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() &&
+ CS.paramHasAttr(1, Attribute::StructRet))
NumBytesCallee = 4;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesCallee);
@@ -2102,7 +2104,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
if (!X86SelectAddress(C, AM))
return 0;
unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
- TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
unsigned ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(Opc), ResultReg), AM);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index e3461c8..32de194 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -1644,6 +1644,30 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
return;
}
+ case X86::WIN_FTOL_32:
+ case X86::WIN_FTOL_64: {
+ // Push the operand into ST0.
+ MachineOperand &Op = MI->getOperand(0);
+ assert(Op.isUse() && Op.isReg() &&
+ Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
+ unsigned FPReg = getFPReg(Op);
+ if (Op.isKill())
+ moveToTop(FPReg, I);
+ else
+ duplicateToTop(FPReg, FPReg, I);
+
+ // Emit the call. This will pop the operand.
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("_ftol2")
+ .addReg(X86::ST0, RegState::ImplicitKill)
+ .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::EDX, RegState::Define | RegState::Implicit)
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ --StackTop;
+
+ break;
+ }
+
case X86::RET:
case X86::RETI:
// If RET has an FP register use operand, pass the first one in ST(0) and
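
For reference, the kind of source that exercises the new WIN_FTOL handling, assuming a 32-bit MSVC-style target (e.g. i686-pc-win32) where truncating FP-to-integer conversion goes through the CRT helper: _ftol2 takes its operand in ST(0) and returns the result in EDX:EAX, exactly as the BuildMI sequence above models.

long long truncate_double(double d) {
  return (long long)d; // lowers to: fld <d>; call _ftol2; result in EDX:EAX
}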
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 6a40cc1..000e375 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1,4 +1,4 @@
-//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====//
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -79,6 +79,10 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) {
}
}
+static unsigned getLEArOpcode(unsigned is64Bit) {
+ return is64Bit ? X86::LEA64r : X86::LEA32r;
+}
+
/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
/// when it reaches the "return" instruction. We can then pop a stack object
/// to this register without worry about clobbering it.
@@ -91,11 +95,11 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
if (!F || MF->getMMI().callsEHReturn())
return 0;
- static const unsigned CallerSavedRegs32Bit[] = {
+ static const uint16_t CallerSavedRegs32Bit[] = {
X86::EAX, X86::EDX, X86::ECX, 0
};
- static const unsigned CallerSavedRegs64Bit[] = {
+ static const uint16_t CallerSavedRegs64Bit[] = {
X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
X86::R8, X86::R9, X86::R10, X86::R11, 0
};
@@ -113,7 +117,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
case X86::TCRETURNmi64:
case X86::EH_RETURN:
case X86::EH_RETURN64: {
- SmallSet<unsigned, 8> Uses;
+ SmallSet<uint16_t, 8> Uses;
for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MBBI->getOperand(i);
if (!MO.isReg() || MO.isDef())
@@ -121,11 +125,11 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- for (const unsigned *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI)
+ for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI)
Uses.insert(*AsI);
}
- const unsigned *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
+ const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
for (; *CS; ++CS)
if (!Uses.count(*CS))
return *CS;
@@ -141,13 +145,18 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
static
void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
unsigned StackPtr, int64_t NumBytes,
- bool Is64Bit, const TargetInstrInfo &TII,
- const TargetRegisterInfo &TRI) {
+ bool Is64Bit, bool UseLEA,
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
- unsigned Opc = isSub ?
- getSUBriOpcode(Is64Bit, Offset) :
- getADDriOpcode(Is64Bit, Offset);
+ unsigned Opc;
+ if (UseLEA)
+ Opc = getLEArOpcode(Is64Bit);
+ else
+ Opc = isSub
+ ? getSUBriOpcode(Is64Bit, Offset)
+ : getADDriOpcode(Is64Bit, Offset);
+
uint64_t Chunk = (1LL << 31) - 1;
DebugLoc DL = MBB.findDebugLoc(MBBI);
@@ -171,13 +180,21 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
}
}
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr)
- .addImm(ThisVal);
+ MachineInstr *MI = NULL;
+
+ if (UseLEA) {
+ MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ StackPtr, false, isSub ? -ThisVal : ThisVal);
+ } else {
+ MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ThisVal);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+
if (isSub)
MI->setFlag(MachineInstr::FrameSetup);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
Offset -= ThisVal;
}
}
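
A standalone sketch of the opcode choice above (function and names invented for illustration): with UseLEA the stack adjustment becomes an address computation that leaves EFLAGS untouched, which is the point of FeatureLeaForSP on Atom; otherwise it stays a flag-writing ADD/SUB.

#include <cstdio>

void printSPUpdate(bool UseLEA, long long NumBytes) {
  if (UseLEA)
    printf("leal %lld(%%esp), %%esp\n", NumBytes);  // no EFLAGS def
  else if (NumBytes < 0)
    printf("subl $%lld, %%esp\n", -NumBytes);       // writes EFLAGS (dead)
  else
    printf("addl $%lld, %%esp\n", NumBytes);
}

int main() {
  printSPUpdate(false, -24); // subl $24, %esp
  printSPUpdate(true,  -24); // leal -24(%esp), %esp
  return 0;
}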
@@ -191,7 +208,8 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
MachineBasicBlock::iterator PI = prior(MBBI);
unsigned Opc = PI->getOpcode();
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr) {
if (NumBytes)
*NumBytes += PI->getOperand(2).getImm();
@@ -237,8 +255,8 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
}
/// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB instruction it is deleted argument and the
-/// stack adjustment is returned as a positive value for ADD and a negative for
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted and the
+/// stack adjustment is returned as a positive value for ADD/LEA, negative for
/// SUB.
static int mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
@@ -254,7 +272,8 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
int Offset = 0;
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+ Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr){
Offset += PI->getOperand(2).getImm();
MBB.erase(PI);
@@ -456,21 +475,21 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
// Encode the registers in the order they were saved, 3-bits per register. The
- // registers are numbered from 1 to 6.
+ // registers are numbered from 1 to CU_NUM_SAVED_REGS.
uint32_t RegEnc = 0;
- for (int I = 0; I != 6; --I) {
+ for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
unsigned Reg = SavedRegs[I];
- if (Reg == 0) break;
+ if (Reg == 0) continue;
+
int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
- if (CURegNum == -1)
- return ~0U;
+ if (CURegNum == -1) return ~0U;
// Encode the 3-bit register number in order, skipping over 3-bits for each
// register.
- RegEnc |= (CURegNum & 0x7) << ((5 - I) * 3);
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
}
- assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!");
+ assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
return RegEnc;
}
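
The fixed loop now skips empty save slots instead of stopping at the first one, and the widened assert matches six 3-bit fields (18 bits). A runnable sketch with made-up, pre-mapped register numbers (the real code maps through getCompactUnwindRegNum first):

#include <cassert>
#include <cstdio>

int main() {
  const int CU_NUM_SAVED_REGS = 6;
  unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 1, 2, 0, 3, 0, 0 }; // 0 = unused
  unsigned RegEnc = 0;
  for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
    unsigned Reg = SavedRegs[I];
    if (Reg == 0) continue;
    RegEnc |= (Reg & 0x7) << (Idx++ * 3);
  }
  assert((RegEnc & 0x3FFFF) == RegEnc && "must fit in 18 bits");
  printf("0x%x\n", RegEnc); // 0x53: slots 3, 1, 0 packed into successive fields
  return 0;
}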
@@ -626,6 +645,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
bool HasFP = hasFP(MF);
bool Is64Bit = STI.is64Bit();
bool IsWin64 = STI.isTargetWin64();
+ bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
unsigned SlotSize = RegInfo->getSlotSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -879,7 +899,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// FIXME: %rax preserves the offset and should be available.
if (isSPUpdateNeeded)
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
- TII, *RegInfo);
+ UseLEA, TII, *RegInfo);
if (isEAXAlive) {
// Restore EAX
@@ -891,7 +911,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
}
} else if (NumBytes)
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
- TII, *RegInfo);
+ UseLEA, TII, *RegInfo);
if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
// Mark end of stack pointer adjustment.
@@ -935,6 +955,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned RetOpcode = MBBI->getOpcode();
DebugLoc DL = MBBI->getDebugLoc();
bool Is64Bit = STI.is64Bit();
+ bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
unsigned SlotSize = RegInfo->getSlotSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -1015,7 +1036,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// We cannot use LEA here, because stack pointer was realigned. We need to
// deallocate local frame back.
if (CSSize) {
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII,
+ *RegInfo);
MBBI = prior(LastCSPop);
}
@@ -1036,7 +1058,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, *RegInfo);
}
// We're returning from function via eh_return.
@@ -1071,7 +1093,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, UseLEA, TII, *RegInfo);
}
// Jump to label or value in register.
@@ -1115,7 +1137,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Check for possible merge with preceding ADD instruction.
delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, UseLEA, TII, *RegInfo);
}
}
@@ -1298,29 +1320,29 @@ HasNestArgument(const MachineFunction *MF) {
return false;
}
+
+/// GetScratchRegister - Get a register for performing work in the segmented
+/// stack prologue. Depending on platform and the properties of the function
+/// either one or two registers will be needed. Set Primary to true for
+/// the first register, false for the second.
static unsigned
-GetScratchRegister(bool Is64Bit, const MachineFunction &MF) {
- if (Is64Bit) {
- return X86::R11;
- } else {
- CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
- bool IsNested = HasNestArgument(&MF);
-
- if (CallingConvention == CallingConv::X86_FastCall) {
- if (IsNested) {
- report_fatal_error("Segmented stacks does not support fastcall with "
- "nested function.");
- return -1;
- } else {
- return X86::EAX;
- }
- } else {
- if (IsNested)
- return X86::EDX;
- else
- return X86::ECX;
- }
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+ if (Is64Bit)
+ return Primary ? X86::R11 : X86::R12;
+
+ CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+ bool IsNested = HasNestArgument(&MF);
+
+ if (CallingConvention == CallingConv::X86_FastCall ||
+ CallingConvention == CallingConv::Fast) {
+ if (IsNested)
+ report_fatal_error("Segmented stacks does not support fastcall with "
+ "nested function.");
+ return Primary ? X86::EAX : X86::ECX;
}
+ if (IsNested)
+ return Primary ? X86::EDX : X86::EAX;
+ return Primary ? X86::ECX : X86::EAX;
}
// The stack limit in the TCB is set to this many bytes above the actual stack
@@ -1338,14 +1360,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
DebugLoc DL;
const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
- unsigned ScratchReg = GetScratchRegister(Is64Bit, MF);
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
"Scratch register is live-in");
if (MF.getFunction()->isVarArg())
report_fatal_error("Segmented stacks do not support vararg functions.");
- if (!ST->isTargetLinux())
- report_fatal_error("Segmented stacks supported only on linux.");
+ if (!ST->isTargetLinux() && !ST->isTargetDarwin() &&
+ !ST->isTargetWin32() && !ST->isTargetFreeBSD())
+ report_fatal_error("Segmented stacks not supported on this platform.");
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
@@ -1376,36 +1399,99 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
// prologue.
StackSize = MFI->getStackSize();
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
// Read the limit off the current stacklet off the stack_guard location.
if (Is64Bit) {
- TlsReg = X86::FS;
- TlsOffset = 0x70;
+ if (ST->isTargetLinux()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x70;
+ } else if (ST->isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (ST->isTargetFreeBSD()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x18;
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
- if (StackSize < kSplitStackAvailable)
+ if (CompareStackPointer)
ScratchReg = X86::RSP;
else
BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
- .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
- .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else {
- TlsReg = X86::GS;
- TlsOffset = 0x30;
+ if (ST->isTargetLinux()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
+ } else if (ST->isTargetDarwin()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x48 + 90*4;
+ } else if (ST->isTargetWin32()) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x14; // pvArbitrary, reserved for application use
+ } else if (ST->isTargetFreeBSD()) {
+ report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+ } else {
+ report_fatal_error("Segmented stacks not supported on this platform.");
+ }
- if (StackSize < kSplitStackAvailable)
+ if (CompareStackPointer)
ScratchReg = X86::ESP;
else
BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
- .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+ .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+ if (ST->isTargetLinux() || ST->isTargetWin32()) {
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else if (ST->isTargetDarwin()) {
+
+ // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+ unsigned ScratchReg2;
+ bool SaveScratch2;
+ if (CompareStackPointer) {
+ // The primary scratch register is available for holding the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+ SaveScratch2 = false;
+ } else {
+ // Need to use a second register to hold the TLS offset
+ ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+
+ // Unfortunately, with fastcc the second scratch register may hold an arg
+ SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+ }
- BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
- .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ // If Scratch2 is live-in then it needs to be saved
+ assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+ "Scratch register is live-in and not saved");
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+ .addReg(ScratchReg2, RegState::Kill);
+
+ BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+ .addImm(TlsOffset);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+ .addReg(ScratchReg)
+ .addReg(ScratchReg2).addImm(1).addReg(0)
+ .addImm(0)
+ .addReg(TlsReg);
+
+ if (SaveScratch2)
+ BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+ }
}
// This jump is taken if SP >= (Stacklet Limit + Stack Space required).
// It jumps to normal execution of the function body.
- BuildMI(checkMBB, DL, TII.get(X86::JG_4)).addMBB(&prologueMBB);
+ BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB);
// On 32 bit we first push the arguments size and then the frame size. On 64
// bit, we pass the stack frame size in r10 and the argument size in r11.
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 6f49064..d55a497 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -1,4 +1,4 @@
-//=-- X86TargetFrameLowering.h - Define frame lowering for X86 ---*- C++ -*-===//
+//===-- X86FrameLowering.h - Define frame lowering for X86 -------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3c35763..aa508b8 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -540,7 +540,7 @@ void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
const TargetInstrInfo *TII = TM.getInstrInfo();
if (Subtarget->isTargetCygMing()) {
unsigned CallOp =
- Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32;
+ Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
BuildMI(BB, DebugLoc(),
TII->get(CallOp)).addExternalSymbol("__main");
}
@@ -725,6 +725,213 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
return false;
}
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+ if (N.getNode()->getNodeId() == -1 ||
+ N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+ DAG.RepositionNode(Pos.getNode(), N.getNode());
+ N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ }
+}
+
+// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
+// allows us to convert the shift and and into an h-register extract and
+// a scaled index. Returns false if the simplification is performed.
+static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ !Shift.hasOneUse())
+ return true;
+
+ int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+ if (ScaleLog <= 0 || ScaleLog >= 4 ||
+ Mask != (0xffu << ScaleLog))
+ return true;
+
+ EVT VT = N.getValueType();
+ DebugLoc DL = N.getDebugLoc();
+ SDValue Eight = DAG.getConstant(8, MVT::i8);
+ SDValue NewMask = DAG.getConstant(0xff, VT);
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+ SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, Eight);
+ InsertDAGNode(DAG, N, Srl);
+ InsertDAGNode(DAG, N, NewMask);
+ InsertDAGNode(DAG, N, And);
+ InsertDAGNode(DAG, N, ShlCount);
+ InsertDAGNode(DAG, N, Shl);
+ DAG.ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+}
+
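
A concrete instance of the transform, with C1 = 5 so ScaleLog = 8 - 5 = 3 and the mask required to be 0xff << 3 = 0x7f8; the assert checks that the rewritten form really is an h-register extract feeding a scale-8 index:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 100000; X += 7)
    assert(((X >> 5) & 0x7f8) == (((X >> 8) & 0xff) << 3));
  return 0;
}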
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeds.
+static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SHL ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides,
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ return true;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+ return true;
+
+ EVT VT = N.getValueType();
+ DebugLoc DL = N.getDebugLoc();
+ SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+ SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, NewMask);
+ InsertDAGNode(DAG, N, NewAnd);
+ InsertDAGNode(DAG, N, NewShift);
+ DAG.ReplaceAllUsesWith(N, NewShift);
+
+ AM.Scale = 1 << ShiftAmt;
+ AM.IndexReg = NewAnd;
+ return false;
+}
+
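
The same rewrite in miniature, with C1 = 2 and C2 = 0xff0: hoisting the mask past the shift exposes a left shift of 2 that the addressing mode can absorb as a scale of 4.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 100000; X += 7)
    assert(((X << 2) & 0xff0) == ((X & (0xff0 >> 2)) << 2));
  return 0;
}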
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+// int f(short *y, int *lookup_table) {
+// ...
+// return *y + lookup_table[*y >> 11];
+// }
+//
+// Turning into:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $11, %ecx
+// addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+// movzwl (%rdi), %eax
+// movl %eax, %ecx
+// shrl $9, %ecx
+// andl $124, %rcx
+// addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+ uint64_t Mask,
+ SDValue Shift, SDValue X,
+ X86ISelAddressMode &AM) {
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+ !isa<ConstantSDNode>(Shift.getOperand(1)))
+ return true;
+
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ unsigned MaskLZ = CountLeadingZeros_64(Mask);
+ unsigned MaskTZ = CountTrailingZeros_64(Mask);
+
+ // The amount of shift we're trying to fit into the addressing mode is taken
+ // from the trailing zeros of the mask.
+ unsigned AMShiftAmt = MaskTZ;
+
+ // There is nothing we can do here unless the mask is removing some bits.
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+ if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+ // We also need to ensure that mask is a continuous run of bits.
+ if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+ // Scale the leading zero count down based on the actual size of the value.
+ // Also scale it down based on the size of the shift.
+ MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+
+ // The final check is to ensure that any masked out high bits of X are
+ // already known to be zero. Otherwise, the mask has a semantic impact
+ // other than masking out a couple of low bits. Unfortunately, because of
+ // the mask, zero extensions will be removed from operands in some cases.
+ // This code works extra hard to look through extensions because we can
+ // replace them with zero extensions cheaply if necessary.
+ bool ReplacingAnyExtend = false;
+ if (X.getOpcode() == ISD::ANY_EXTEND) {
+ unsigned ExtendBits =
+ X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+ // Assume that we'll replace the any-extend with a zero-extend, and
+ // narrow the search to the extended value.
+ X = X.getOperand(0);
+ MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+ ReplacingAnyExtend = true;
+ }
+ APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
+ MaskLZ);
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(X, MaskedHighBits, KnownZero, KnownOne);
+ if (MaskedHighBits != KnownZero) return true;
+
+ // We've identified a pattern that can be transformed into a single shift
+ // and an addressing mode. Make it so.
+ EVT VT = N.getValueType();
+ if (ReplacingAnyExtend) {
+ assert(X.getValueType() != VT);
+ // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+ InsertDAGNode(DAG, N, NewX);
+ X = NewX;
+ }
+ DebugLoc DL = N.getDebugLoc();
+ SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
+ SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+ SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+ // Insert the new nodes into the topological ordering. We must do this in
+ // a valid topological ordering as nothing is going to go back and re-sort
+ // these nodes. We continually insert before 'N' in sequence as this is
+ // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+ // hierarchy left to express.
+ InsertDAGNode(DAG, N, NewSRLAmt);
+ InsertDAGNode(DAG, N, NewSRL);
+ InsertDAGNode(DAG, N, NewSHLAmt);
+ InsertDAGNode(DAG, N, NewSHL);
+ DAG.ReplaceAllUsesWith(N, NewSHL);
+
+ AM.Scale = 1 << AMShiftAmt;
+ AM.IndexReg = NewSRL;
+ return false;
+}
+
bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
DebugLoc dl = N.getDebugLoc();
@@ -814,6 +1021,33 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
break;
}
+ case ISD::SRL: {
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ SDValue And = N.getOperand(0);
+ if (And.getOpcode() != ISD::AND) break;
+ SDValue X = And.getOperand(0);
+
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getValueSizeInBits() > 64) break;
+
+ // The mask used for the transform is expected to be post-shift, but we
+ // found the shift first, so just apply the shift to the mask before passing
+ // it down.
+ if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+ !isa<ConstantSDNode>(And.getOperand(1)))
+ break;
+ uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+ // Try to fold the mask and shift into the scale, and return false if we
+ // succeed.
+ if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ return false;
+ break;
+ }
+
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
// A mul_lohi where we need the low part can be folded as a plain multiply.
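The pre-shifting of the mask in the SRL case relies on a shift/mask commutation that holds unconditionally for logical right shifts. A small self-checking sketch (constants arbitrary, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t M = 0xFF0; // mask found above the shift
      for (uint64_t X = 0; X != 4096; ++X)
        for (unsigned S = 0; S != 8; ++S)
          // and-then-shift == shift-then-shifted-mask, so the mask can be
          // handed to FoldMaskAndShiftToScale in post-shift form.
          assert(((X & M) >> S) == ((X >> S) & (M >> S)));
    }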
@@ -917,16 +1151,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.Scale = 1;
// Insert the new nodes into the topological ordering.
- if (Zero.getNode()->getNodeId() == -1 ||
- Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
- CurDAG->RepositionNode(N.getNode(), Zero.getNode());
- Zero.getNode()->setNodeId(N.getNode()->getNodeId());
- }
- if (Neg.getNode()->getNodeId() == -1 ||
- Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
- CurDAG->RepositionNode(N.getNode(), Neg.getNode());
- Neg.getNode()->setNodeId(N.getNode()->getNodeId());
- }
+ InsertDAGNode(*CurDAG, N, Zero);
+ InsertDAGNode(*CurDAG, N, Neg);
return false;
}
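InsertDAGNode itself is defined earlier in the patch, outside this hunk. Judging from the repeated pattern it replaces above, it presumably has this shape (a reconstruction, not quoted from the source):

    static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
      // Hoist N to just before Pos in the topological order, if needed.
      if (N.getNode()->getNodeId() == -1 ||
          N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
        DAG.RepositionNode(Pos.getNode(), N.getNode());
        N.getNode()->setNodeId(Pos.getNode()->getNodeId());
      }
    }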
@@ -981,121 +1207,34 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
- SDValue Shift = N.getOperand(0);
- if (Shift.getNumOperands() != 2) break;
-
// Scale must not be used already.
if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
SDValue X = Shift.getOperand(0);
- ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
- ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
- if (!C1 || !C2) break;
-
- // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
- // allows us to convert the shift and and into an h-register extract and
- // a scaled index.
- if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
- unsigned ScaleLog = 8 - C1->getZExtValue();
- if (ScaleLog > 0 && ScaleLog < 4 &&
- C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
- SDValue Eight = CurDAG->getConstant(8, MVT::i8);
- SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
- SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
- X, Eight);
- SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
- Srl, Mask);
- SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8);
- SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
- And, ShlCount);
-
- // Insert the new nodes into the topological ordering.
- if (Eight.getNode()->getNodeId() == -1 ||
- Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
- CurDAG->RepositionNode(X.getNode(), Eight.getNode());
- Eight.getNode()->setNodeId(X.getNode()->getNodeId());
- }
- if (Mask.getNode()->getNodeId() == -1 ||
- Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
- CurDAG->RepositionNode(X.getNode(), Mask.getNode());
- Mask.getNode()->setNodeId(X.getNode()->getNodeId());
- }
- if (Srl.getNode()->getNodeId() == -1 ||
- Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
- CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
- Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
- }
- if (And.getNode()->getNodeId() == -1 ||
- And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
- CurDAG->RepositionNode(N.getNode(), And.getNode());
- And.getNode()->setNodeId(N.getNode()->getNodeId());
- }
- if (ShlCount.getNode()->getNodeId() == -1 ||
- ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
- CurDAG->RepositionNode(X.getNode(), ShlCount.getNode());
- ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
- }
- if (Shl.getNode()->getNodeId() == -1 ||
- Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
- CurDAG->RepositionNode(N.getNode(), Shl.getNode());
- Shl.getNode()->setNodeId(N.getNode()->getNodeId());
- }
- CurDAG->ReplaceAllUsesWith(N, Shl);
- AM.IndexReg = And;
- AM.Scale = (1 << ScaleLog);
- return false;
- }
- }
- // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
- // allows us to fold the shift into this addressing mode.
- if (Shift.getOpcode() != ISD::SHL) break;
+ // We only handle up to 64-bit values here as those are what matter for
+ // addressing mode optimizations.
+ if (X.getValueSizeInBits() > 64) break;
- // Not likely to be profitable if either the AND or SHIFT node has more
- // than one use (unless all uses are for address computation). Besides,
- // isel mechanism requires their node ids to be reused.
- if (!N.hasOneUse() || !Shift.hasOneUse())
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
-
- // Verify that the shift amount is something we can fold.
- unsigned ShiftCst = C1->getZExtValue();
- if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
- break;
-
- // Get the new AND mask, this folds to a constant.
- SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
- SDValue(C2, 0), SDValue(C1, 0));
- SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
- NewANDMask);
- SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
- NewAND, SDValue(C1, 0));
+ uint64_t Mask = N.getConstantOperandVal(1);
- // Insert the new nodes into the topological ordering.
- if (C1->getNodeId() > X.getNode()->getNodeId()) {
- CurDAG->RepositionNode(X.getNode(), C1);
- C1->setNodeId(X.getNode()->getNodeId());
- }
- if (NewANDMask.getNode()->getNodeId() == -1 ||
- NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
- CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode());
- NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId());
- }
- if (NewAND.getNode()->getNodeId() == -1 ||
- NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
- CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode());
- NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId());
- }
- if (NewSHIFT.getNode()->getNodeId() == -1 ||
- NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) {
- CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode());
- NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId());
- }
+ // Try to fold the mask and shift into an extract and scale.
+ if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
- CurDAG->ReplaceAllUsesWith(N, NewSHIFT);
-
- AM.Scale = 1 << ShiftCst;
- AM.IndexReg = NewAND;
- return false;
+ // Try to fold the mask and shift directly into the scale.
+ if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+
+ // Try to swap the mask and shift to place shifts which can be done as
+ // a scale on the outside of the mask.
+ if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+ return false;
+ break;
}
}
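The two remaining folds are also plain bit identities; a sketch with arbitrary constants (only the helper names come from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X != (1u << 16); ++X) {
        // FoldMaskAndShiftToExtract: (X >> C1) & (0xff << (8-C1)) extracts
        // byte 1 of X and scales it -- an h-register extract plus scale.
        const unsigned C1 = 6, ScaleLog = 8 - C1;
        assert(((X >> C1) & (0xffu << ScaleLog)) ==
               (((X >> 8) & 0xffu) << ScaleLog));
        // FoldMaskedShiftToScaledMask: move the SHL outside the AND so it
        // can be absorbed as the addressing-mode scale.
        const unsigned S = 3;
        const uint32_t C2 = 0xF8;
        assert(((X << S) & C2) == ((X & (C2 >> S)) << S));
      }
    }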
@@ -1829,7 +1968,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
getI8Imm(ShlVal));
- break;
}
case X86ISD::UMUL: {
SDValue N0 = Node->getOperand(0);
@@ -2131,7 +2269,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
- TargetRegisterClass *TRC = 0;
+ const TargetRegisterClass *TRC;
switch (N0.getValueType().getSimpleVT().SimpleTy) {
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
@@ -2160,7 +2298,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Reg = N0.getNode()->getOperand(0);
// Put the value in an ABCD register.
- TargetRegisterClass *TRC = 0;
+ const TargetRegisterClass *TRC;
switch (N0.getValueType().getSimpleVT().SimpleTy) {
case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
@@ -2240,6 +2378,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
Chain->getOpcode() != ISD::LOAD ||
StoredVal->getOpcode() != X86ISD::DEC ||
StoredVal.getResNo() != 0 ||
+ !StoredVal.getNode()->hasNUsesOfValue(1, 0) ||
+ !Chain.getNode()->hasNUsesOfValue(1, 0) ||
StoredVal->getOperand(0).getNode() != Chain.getNode())
break;
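The two new hasNUsesOfValue checks guard the load+DEC+store fusion: folding into a single memory-destination DEC is only safe when the decremented value feeds nothing but this store and the loaded value feeds nothing but the DEC, since both nodes disappear in the folded form.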
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 03727a2..cae9aad 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13,9 +13,9 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
#include "X86.h"
#include "X86InstrBuilder.h"
-#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
@@ -39,20 +39,17 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
+#include <bitset>
using namespace llvm;
-using namespace dwarf;
STATISTIC(NumTailCalls, "Number of tail calls");
@@ -60,17 +57,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2);
-static SDValue Insert128BitVector(SDValue Result,
- SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl);
-
-static SDValue Extract128BitVector(SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl);
-
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
@@ -168,8 +154,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
: TargetLowering(TM, createTLOF(TM)) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
- X86ScalarSSEf64 = Subtarget->hasXMMInt();
- X86ScalarSSEf32 = Subtarget->hasXMM();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
RegInfo = TM.getRegisterInfo();
@@ -185,8 +171,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// For 64-bit, since we have so many registers, use the ILP scheduler; for
// 32-bit code, use the register-pressure-specific scheduling.
+ // For 32-bit Atom, use Hybrid (register pressure + latency) scheduling.
if (Subtarget->is64Bit())
setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::Hybrid);
else
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -198,15 +187,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
- setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
- setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
- setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
- setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
+
+ // The _ftol2 runtime function has an unusual calling conv, which
+ // is modeled by a special pseudo-instruction.
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
}
if (Subtarget->isTargetDarwin()) {
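Nulling the libcall names keeps the generic legalizer from emitting an ordinary C call here: by the usual MSVC convention, _ftol2 takes its argument in ST(0) and returns the truncated value in EDX:EAX, so the conversion is instead routed through Custom FP_TO_UINT lowering (see the isTargetFTOL() block below) and, presumably, the pseudo-instruction mentioned in the comment.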
@@ -255,7 +247,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
} else if (!TM.Options.UseSoftFloat) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
@@ -326,6 +318,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
}
+ if (isTargetFTOL()) {
+ // Use the _ftol2 runtime function, which has a pseudo-instruction
+ // to handle its weird calling convention.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ }
+
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
@@ -378,32 +376,46 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i16 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i64 , Expand);
+ // Promote the i8 variants and force them up to i32, which has a shorter
+ // encoding.
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
if (Subtarget->hasBMI()) {
- setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
} else {
- setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
if (Subtarget->is64Bit())
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
}
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i8 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i16 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i64 , Expand);
if (Subtarget->hasLZCNT()) {
+ // When promoting the i8 variants, force them to i32 for a shorter
+ // encoding.
setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
+ AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ }
}
if (Subtarget->hasPOPCNT()) {
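The i8 promotion amounts to the usual width-extension trick; a minimal C sketch of the presumed promoted forms (an assumption about what type promotion produces, not code from the patch):

    #include <cstdint>

    // ISD::CTTZ on i8, promoted to i32: OR-ing in bit 8 bounds the scan so
    // cttz8(0) correctly yields 8.
    unsigned cttz8(uint8_t V) { return __builtin_ctz((uint32_t)V | 0x100u); }

    // CTTZ_ZERO_UNDEF: the input is known non-zero, so a plain 32-bit count
    // on the zero-extended value suffices.
    unsigned cttz8_undef(uint8_t V) { return __builtin_ctz(V); }

    // CTLZ is analogous: count on the zero-extended value, then subtract the
    // 24 leading zeros contributed by the extension.
    unsigned ctlz8_undef(uint8_t V) { return __builtin_clz(V) - 24; }

With TZCNT/LZCNT (the BMI and LZCNT paths above) the zero-input case is defined by the hardware, which is why the _ZERO_UNDEF variants can simply be expanded there.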
@@ -466,7 +478,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
}
- if (Subtarget->hasXMM())
+ if (Subtarget->hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
@@ -800,7 +812,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
- if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@@ -817,7 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@@ -923,7 +935,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
}
- if (Subtarget->hasSSE41orAVX()) {
+ if (Subtarget->hasSSE41()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -959,14 +971,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
// FIXME: these should be Legal but that's only for the case where
- // the index is constant. For now custom expand to deal with that
+ // the index is constant. For now custom expand to deal with that.
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
}
}
- if (Subtarget->hasXMMInt()) {
+ if (Subtarget->hasSSE2()) {
setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
@@ -995,7 +1007,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
- if (Subtarget->hasSSE42orAVX())
+ if (Subtarget->hasSSE42())
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
@@ -1157,7 +1169,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// of this type with custom code.
for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+ Custom);
}
// We want to custom lower some of our intrinsics.
@@ -1195,7 +1208,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
@@ -1210,6 +1222,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SINT_TO_FP);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
@@ -1279,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
}
unsigned Align = 4;
- if (Subtarget->hasXMM())
+ if (Subtarget->hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
@@ -1313,17 +1327,20 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
- if (Subtarget->hasAVX() &&
- Subtarget->getStackAlignment() >= 32)
- return MVT::v8f32;
- if (Subtarget->hasXMMInt())
+ if (Subtarget->getStackAlignment() >= 32) {
+ if (Subtarget->hasAVX2())
+ return MVT::v8i32;
+ if (Subtarget->hasAVX())
+ return MVT::v8f32;
+ }
+ if (Subtarget->hasSSE2())
return MVT::v4i32;
- if (Subtarget->hasXMM())
+ if (Subtarget->hasSSE1())
return MVT::v4f32;
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
Subtarget->getStackAlignment() >= 8 &&
- Subtarget->hasXMMInt()) {
+ Subtarget->hasSSE2()) {
// Do not use f64 to lower memcpy if the source is a string constant. It's
// better to use i32 to avoid the loads.
return MVT::f64;
@@ -1488,14 +1505,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
+ (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
if (ValVT == MVT::f64 &&
- (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
+ (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -1521,7 +1538,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasXMMInt())
+ if (!Subtarget->hasSSE2())
ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
}
}
@@ -1568,8 +1585,12 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
return false;
SDNode *Copy = *N->use_begin();
- if (Copy->getOpcode() != ISD::CopyToReg &&
- Copy->getOpcode() != ISD::FP_EXTEND)
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+ return false;
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
@@ -1621,7 +1642,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
- ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
@@ -1711,7 +1732,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
}
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
+ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
return false;
CallSite CS(CI);
@@ -1790,6 +1811,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
+ bool IsWindows = Subtarget->isTargetWindows();
bool IsWin64 = Subtarget->isTargetWin64();
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -1820,7 +1842,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
- TargetRegisterClass *RC = NULL;
+ const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = X86::GR32RegisterClass;
else if (Is64Bit && RegVT == MVT::i64)
@@ -1928,19 +1950,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
GPR64ArgRegs = GPR64ArgRegs64Bit;
- NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
+ NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+ TotalNumXMMRegs);
}
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
TotalNumIntRegs);
bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
- assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
- !Subtarget->hasXMM())
+ !Subtarget->hasSSE1())
// Kernel mode asks for SSE to be disabled, so don't push them
// on the stack.
TotalNumXMMRegs = 0;
@@ -1957,8 +1980,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so they
- // may be loaded by deferencing the result of va_next.
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(
@@ -2024,7 +2047,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
+ if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ ArgsAreStructReturn(Ins))
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2099,7 +2123,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
- bool &isTailCall,
+ bool doesNotRet, bool &isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2108,9 +2132,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWindows = Subtarget->isTargetWindows();
bool IsStructRet = CallIsStructReturn(Outs);
bool IsSibcall = false;
+ if (MF.getTarget().Options.DisableTailCalls)
+ isTailCall = false;
+
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
@@ -2303,7 +2331,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
- assert((Subtarget->hasXMM() || !NumXMMRegs)
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@@ -2488,6 +2516,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
if (Is64Bit && isVarArg && !IsWin64)
Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
if (InFlag.getNode())
Ops.push_back(InFlag);
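The register mask operand is the compact way to model call clobbers: getCallPreservedMask returns a bitmask of the registers that survive a call under the given convention, and attaching it to the call node lets the register allocator see the clobber set without a long list of implicit register operands.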
@@ -2510,10 +2544,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ IsStructRet)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPush = 4;
else
NumBytesForCalleeToPush = 0; // Callee pops nothing.
@@ -2709,9 +2745,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
}
- // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
- // Therefore if it's not used by the call it is not safe to optimize this into
- // a sibcall.
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87
+ // stack. Therefore, if it's not used by the call it is not safe to optimize
+ // this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
@@ -2853,9 +2889,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
- case X86ISD::SHUFPD:
+ case X86ISD::SHUFP:
case X86ISD::PALIGN:
- case X86ISD::SHUFPS:
case X86ISD::MOVLHPS:
case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
@@ -2872,7 +2907,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPERM2X128:
return true;
}
- return false;
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2884,8 +2918,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::MOVDDUP:
return DAG.getNode(Opc, dl, VT, V1);
}
-
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2898,8 +2930,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::VPERMILP:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
-
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2907,13 +2937,11 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
case X86ISD::PALIGN:
- case X86ISD::SHUFPD:
- case X86ISD::SHUFPS:
+ case X86ISD::SHUFP:
case X86ISD::VPERM2X128:
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2931,7 +2959,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::UNPCKH:
return DAG.getNode(Opc, dl, VT, V1, V2);
}
- return SDValue();
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
@@ -3126,17 +3153,6 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
-/// isUndefOrInRange - Return true if every element in Mask, begining
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// range (L, L+Pos]. or is undef.
-static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
- int Pos, int Size, int Low, int Hi) {
- for (int i = Pos, e = Pos+Size; i != e; ++i)
- if (!isUndefOrInRange(Mask[i], Low, Hi))
- return false;
- return true;
-}
-
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3148,7 +3164,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// at position Pos and ending at Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
-static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
int Pos, int Size, int Low) {
for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
@@ -3159,7 +3175,7 @@ static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
-static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
if (VT == MVT::v4f32 || VT == MVT::v4i32 )
return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3167,180 +3183,113 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
return false;
}
-bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFDMask(M, N->getValueType(0));
-}
-
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
// Lower quadword copied in order or undef.
- for (int i = 0; i != 4; ++i)
- if (Mask[i] >= 0 && Mask[i] != i)
- return false;
+ if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+ return false;
// Upper quadword shuffled.
- for (int i = 4; i != 8; ++i)
+ for (unsigned i = 4; i != 8; ++i)
if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
return false;
return true;
}
-bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFHWMask(M, N->getValueType(0));
-}
-
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
// Upper quadword copied in order.
- for (int i = 4; i != 8; ++i)
- if (Mask[i] >= 0 && Mask[i] != i)
- return false;
+ if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+ return false;
// Lower quadword shuffled.
- for (int i = 0; i != 4; ++i)
+ for (unsigned i = 0; i != 4; ++i)
if (Mask[i] >= 4)
return false;
return true;
}
-bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFLWMask(M, N->getValueType(0));
-}
-
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool hasSSSE3OrAVX) {
- int i, e = VT.getVectorNumElements();
- if (VT.getSizeInBits() != 128)
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
+ (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
return false;
- // Do not handle v2i64 / v2f64 shuffles with palignr.
- if (e < 4 || !hasSSSE3OrAVX)
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ // Do not handle 64-bit element shuffles with palignr.
+ if (NumLaneElts == 2)
return false;
- for (i = 0; i != e; ++i)
- if (Mask[i] >= 0)
- break;
+ for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+ unsigned i;
+ for (i = 0; i != NumLaneElts; ++i) {
+ if (Mask[i+l] >= 0)
+ break;
+ }
- // All undef, not a palignr.
- if (i == e)
- return false;
+ // Lane is all undef, go to the next lane.
+ if (i == NumLaneElts)
+ continue;
- // Make sure we're shifting in the right direction.
- if (Mask[i] <= i)
- return false;
+ int Start = Mask[i+l];
- int s = Mask[i] - i;
+ // Make sure it's in this lane in one of the sources.
+ if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != e; ++i) {
- int m = Mask[i];
- if (m >= 0 && m != s+i)
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
return false;
- }
- return true;
-}
-/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPSY.
-static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX, bool Commuted = false) {
- int NumElems = VT.getVectorNumElements();
+ // Correct second source to be contiguous with first source
+ if (Start >= (int)NumElts)
+ Start -= NumElts - NumLaneElts;
- if (!HasAVX || VT.getSizeInBits() != 256)
- return false;
+ // Make sure we're shifting in the right direction.
+ if (Start <= (int)(i+l))
+ return false;
- if (NumElems != 4 && NumElems != 8)
- return false;
+ Start -= i;
- // VSHUFPSY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
- // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
- //
- // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
- // Y3..Y0, Y3..Y0, X3..X0, X3..X0
- //
- // VSHUFPDY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X3 X2 X1 X0
- // SRC2 => Y3 Y2 Y1 Y0
- //
- // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
- //
- unsigned QuarterSize = NumElems/4;
- unsigned HalfSize = QuarterSize*2;
- for (unsigned l = 0; l != 2; ++l) {
- unsigned LaneStart = l*HalfSize;
- for (unsigned s = 0; s != 2; ++s) {
- unsigned QuarterStart = s*QuarterSize;
- unsigned Src = (Commuted) ? (1-s) : s;
- unsigned SrcStart = Src*NumElems + LaneStart;
- for (unsigned i = 0; i != QuarterSize; ++i) {
- int Idx = Mask[i+QuarterStart+LaneStart];
- if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
- return false;
- // For VSHUFPSY, the mask of the second half must be the same as the first
- // but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
- continue;
- if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
- return false;
- }
- }
- }
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != NumLaneElts; ++i) {
+ int Idx = Mask[i+l];
- return true;
-}
+ // Make sure it's in this lane.
+ if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
-/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions.
-static unsigned getShuffleVSHUFPYImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
- assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types");
- assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types");
+ if (Idx >= (int)NumElts)
+ Idx -= NumElts - NumLaneElts;
- int HalfSize = NumElems/2;
- unsigned Mul = (NumElems == 8) ? 2 : 1;
- unsigned Mask = 0;
- for (int i = 0; i != NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (Elt < 0)
- continue;
- Elt %= HalfSize;
- unsigned Shamt = i;
- // For VSHUFPSY, the mask of the first half must be equal to the second one.
- if (NumElems == 8) Shamt %= HalfSize;
- Mask |= Elt << (Shamt*Mul);
+ if (!isUndefOrEqual(Idx, Start+i))
+ return false;
+
+ }
}
- return Mask;
+ return true;
}
/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
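The lane-by-lane structure above mirrors PALIGNR itself: per 128-bit lane the result is a 16-byte window taken at the immediate's byte offset from the concatenation of the two sources (with the second source placed after the first, as the index correction above arranges). A scalar model of one lane, with illustrative constants only:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t Cat[32]; // conceptual concatenation of the two 16-byte sources
      for (int i = 0; i != 32; ++i) Cat[i] = i;
      const int Shift = 5; // byte count from the PALIGNR immediate
      // The equivalent shuffle mask is <5, 6, ..., 20>: consecutive, in-lane,
      // and shifting in the right direction -- exactly what is accepted above.
      for (int i = 0; i != 16; ++i)
        assert(Cat[i + Shift] == (uint8_t)(i + Shift));
    }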
@@ -3359,42 +3308,63 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
}
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
bool Commuted = false) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!HasAVX && VT.getSizeInBits() == 256)
return false;
- if (NumElems != 2 && NumElems != 4)
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElems = NumElems/NumLanes;
+
+ if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
- unsigned Half = NumElems / 2;
- unsigned SrcStart = Commuted ? NumElems : 0;
- for (unsigned i = 0; i != Half; ++i)
- if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
- return false;
- SrcStart = Commuted ? 0 : NumElems;
- for (unsigned i = Half; i != NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
- return false;
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
+ //
+ unsigned HalfLaneElems = NumLaneElems/2;
+ for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = Mask[i+l];
+ unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+ if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the
+ // first but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+ }
+ }
return true;
}
-bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isSHUFPMask(M, N->getValueType(0));
-}
-
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
unsigned NumElems = VT.getVectorNumElements();
if (VT.getSizeInBits() != 128)
@@ -3404,17 +3374,16 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
return false;
// Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
- return isUndefOrEqual(N->getMaskElt(0), 6) &&
- isUndefOrEqual(N->getMaskElt(1), 7) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
- isUndefOrEqual(N->getMaskElt(3), 3);
+ return isUndefOrEqual(Mask[0], 6) &&
+ isUndefOrEqual(Mask[1], 7) &&
+ isUndefOrEqual(Mask[2], 2) &&
+ isUndefOrEqual(Mask[3], 3);
}
/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
-bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
unsigned NumElems = VT.getVectorNumElements();
if (VT.getSizeInBits() != 128)
@@ -3423,26 +3392,29 @@ bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
if (NumElems != 4)
return false;
- return isUndefOrEqual(N->getMaskElt(0), 2) &&
- isUndefOrEqual(N->getMaskElt(1), 3) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
- isUndefOrEqual(N->getMaskElt(3), 3);
+ return isUndefOrEqual(Mask[0], 2) &&
+ isUndefOrEqual(Mask[1], 3) &&
+ isUndefOrEqual(Mask[2], 2) &&
+ isUndefOrEqual(Mask[3], 3);
}
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
+static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+ if (VT.getSizeInBits() != 128)
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i < NumElems/2; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ if (!isUndefOrEqual(Mask[i], i + NumElems))
return false;
- for (unsigned i = NumElems/2; i < NumElems; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), i))
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
return false;
return true;
@@ -3450,19 +3422,19 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
+static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+ unsigned NumElems = VT.getVectorNumElements();
if ((NumElems != 2 && NumElems != 4)
- || N->getValueType(0).getSizeInBits() > 128)
+ || VT.getSizeInBits() > 128)
return false;
- for (unsigned i = 0; i < NumElems/2; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), i))
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = 0; i < NumElems/2; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
return false;
return true;
@@ -3470,9 +3442,9 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
bool HasAVX2, bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+ unsigned NumElts = VT.getVectorNumElements();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -3486,11 +3458,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
- for (unsigned s = 0; s < NumLanes; ++s) {
- for (unsigned i = Start, j = s * NumLaneElts;
- i != End;
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -3504,25 +3474,16 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
return false;
}
}
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
}
return true;
}
-bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
-}
-
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
bool HasAVX2, bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+ unsigned NumElts = VT.getVectorNumElements();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -3536,11 +3497,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
- i != End; i += 2, ++j) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
if (!isUndefOrEqual(BitI, j))
@@ -3553,42 +3512,39 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
return false;
}
}
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
}
return true;
}
-bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
-}
-
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
+ bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
// For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
// FIXME: Need a better way to get rid of this, there's no latency difference
// between UNPCKLPD and MOVDDUP; the latter should always be checked first and
// the former later. We should also remove the "_undef" special mask.
- if (NumElems == 4 && VT.getSizeInBits() == 256)
+ if (NumElts == 4 && VT.getSizeInBits() == 256)
return false;
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumLaneElts = NumElems / NumLanes;
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned s = 0; s < NumLanes; ++s) {
- for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
- i != NumLaneElts * (s + 1);
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+ i != (l+1)*NumLaneElts;
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -3603,81 +3559,77 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
return true;
}
-bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
-}
-
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
return false;
- for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
}
return true;
}
-bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
-}
-
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
if (VT.getVectorElementType().getSizeInBits() < 32)
return false;
+ if (VT.getSizeInBits() == 256)
+ return false;
- int NumElts = VT.getVectorNumElements();
+ unsigned NumElts = VT.getVectorNumElements();
if (!isUndefOrEqual(Mask[0], NumElts))
return false;
- for (int i = 1; i < NumElts; ++i)
+ for (unsigned i = 1; i != NumElts; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
return true;
}
-bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isMOVLMask(M, N->getValueType(0));
-}
-
/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
/// as permutations between 128-bit chunks or halves. As an example: this
/// shuffle below:
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from
/// the second half of V2.
-static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
if (!HasAVX || VT.getSizeInBits() != 256)
return false;
// The shuffle result is divided into half A and half B. In total the two
// sources have 4 halves, namely: C, D, E, F. The final values of A and
// B must come from C, D, E or F.
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
bool MatchA = false, MatchB = false;
// Check if A comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
MatchA = true;
break;
@@ -3685,7 +3637,7 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
}
// Check if B comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
MatchB = true;
break;
@@ -3700,16 +3652,16 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
- int FstHalf = 0, SndHalf = 0;
- for (int i = 0; i < HalfSize; ++i) {
+ unsigned FstHalf = 0, SndHalf = 0;
+ for (unsigned i = 0; i < HalfSize; ++i) {
if (SVOp->getMaskElt(i) > 0) {
FstHalf = SVOp->getMaskElt(i)/HalfSize;
break;
}
}
- for (int i = HalfSize; i < HalfSize*2; ++i) {
+ for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
if (SVOp->getMaskElt(i) > 0) {
SndHalf = SVOp->getMaskElt(i)/HalfSize;
break;
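Worked through on the example from the isVPERM2X128Mask comment above (the final return falls outside this hunk; the bit layout stated here is the standard VPERM2F128 immediate encoding, given as an assumption): for the mask <4, 5, 6, 7, 12, 13, 14, 15> with HalfSize = 4, FstHalf = 4/4 = 1 and SndHalf = 12/4 = 3, so the immediate is FstHalf | (SndHalf << 4) = 0x31.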
@@ -3725,31 +3677,28 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
- int NumElts = VT.getVectorNumElements();
- int NumLanes = VT.getSizeInBits()/128;
-
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
if (!HasAVX)
return false;
+ unsigned NumElts = VT.getVectorNumElements();
// Only match 256-bit with 32/64-bit types
if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
return false;
- int LaneSize = NumElts/NumLanes;
- for (int l = 0; l != NumLanes; ++l) {
- int LaneStart = l*LaneSize;
- for (int i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned LaneSize = NumElts/NumLanes;
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (NumElts == 4 || l == 0)
+ if (NumElts != 8 || l == 0)
continue;
// VPERMILPS handling
if (Mask[i] < 0)
continue;
- if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
+ if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
return false;
}
}
@@ -3757,48 +3706,19 @@ static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
return true;
}
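// A standalone sketch of the lane rule just checked, for a v8f32-style mask
// of two 128-bit lanes with four elements each (editorial; the name and the
// plain-array interface are hypothetical, -1 denotes undef):
static bool isVPermilPSStyleMask(const int Mask[8]) {
  const int LaneSize = 4;
  for (int l = 0; l != 8; l += LaneSize)
    for (int i = 0; i != LaneSize; ++i) {
      int M = Mask[i + l];
      if (M >= 0 && (M < l || M >= l + LaneSize))
        return false;                  // element would cross a 128-bit lane
      if (l != 0 && Mask[i] >= 0 && M >= 0 && M != Mask[i] + l)
        return false;                  // both lanes must repeat one pattern
    }
  return true;
}
// e.g. <1,0,3,2,5,4,7,6> matches, while <0,1,2,3,0,1,2,3> crosses lanes.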
-/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
-static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
-
- int NumElts = VT.getVectorNumElements();
- int NumLanes = VT.getSizeInBits()/128;
- int LaneSize = NumElts/NumLanes;
-
- // Although the mask is equal for both lanes do it twice to get the cases
- // where a mask will match because the same mask element is undef on the
- // first half but valid on the second. This would get pathological cases
- // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
- unsigned Shift = (LaneSize == 4) ? 2 : 1;
- unsigned Mask = 0;
- for (int i = 0; i != NumElts; ++i) {
- int MaskElt = SVOp->getMaskElt(i);
- if (MaskElt < 0)
- continue;
- MaskElt %= LaneSize;
- unsigned Shamt = i;
- // VPERMILPSY, the mask of the first half must be equal to the second one
- if (NumElts == 8) Shamt %= LaneSize;
- Mask |= MaskElt << (Shamt*Shift);
- }
-
- return Mask;
-}
-
-/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
+/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse
/// of what x86 movss wants: the lowest element must be the lowest element of
/// vector 2, and the other elements must come from vector 1 in order.
-static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
- int NumOps = VT.getVectorNumElements();
+ unsigned NumOps = VT.getVectorNumElements();
if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
return false;
if (!isUndefOrEqual(Mask[0], 0))
return false;
- for (int i = 1; i < NumOps; ++i)
+ for (unsigned i = 1; i != NumOps; ++i)
if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
(V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
(V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
@@ -3807,26 +3727,14 @@ static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
return true;
}
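// For example (editorial): with NumOps == 4 the canonical MOVL mask is
// <4,1,2,3> (low element from V2, rest from V1 in order); the commuted form
// accepted here is <0,5,6,7> - low element from V1, rest from V2 in order.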
-static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
- bool V2IsUndef = false) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
-}
-
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3orAVX())
- return false;
-
- // The second vector must be undef
- if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3())
return false;
- EVT VT = N->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
@@ -3834,9 +3742,9 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
return false;
// "i+1" is the value the indexed mask element must have
- for (unsigned i = 0; i < NumElems; i += 2)
- if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
- !isUndefOrEqual(N->getMaskElt(i+1), i+1))
+ for (unsigned i = 0; i != NumElems; i += 2)
+ if (!isUndefOrEqual(Mask[i], i+1) ||
+ !isUndefOrEqual(Mask[i+1], i+1))
return false;
return true;
@@ -3845,16 +3753,11 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3orAVX())
- return false;
-
- // The second vector must be undef
- if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3())
return false;
- EVT VT = N->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
@@ -3862,9 +3765,9 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
return false;
// "i" is the value the indexed mask element must have
- for (unsigned i = 0; i < NumElems; i += 2)
- if (!isUndefOrEqual(N->getMaskElt(i), i) ||
- !isUndefOrEqual(N->getMaskElt(i+1), i))
+ for (unsigned i = 0; i != NumElems; i += 2)
+ if (!isUndefOrEqual(Mask[i], i) ||
+ !isUndefOrEqual(Mask[i+1], i))
return false;
return true;
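// A hedged sketch of the masks the two checks above accept, generated the
// same way they are walked (in element pairs); the helper is illustrative
// only:
static void expectedDupMasks(int NumElems, int *SHDup, int *SLDup) {
  for (int i = 0; i < NumElems; i += 2) {
    SHDup[i] = SHDup[i + 1] = i + 1; // movshdup: <1,1,3,3>, <1,1,3,3,5,5,7,7>
    SLDup[i] = SLDup[i + 1] = i;     // movsldup: <0,0,2,2>, <0,0,2,2,4,4,6,6>
  }
}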
@@ -3873,17 +3776,16 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
-static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
- int NumElts = VT.getVectorNumElements();
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
+ unsigned NumElts = VT.getVectorNumElements();
if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
return false;
- for (int i = 0; i != NumElts/2; ++i)
+ for (unsigned i = 0; i != NumElts/2; ++i)
if (!isUndefOrEqual(Mask[i], 0))
return false;
- for (int i = NumElts/2; i != NumElts; ++i)
+ for (unsigned i = NumElts/2; i != NumElts; ++i)
if (!isUndefOrEqual(Mask[i], NumElts/2))
return false;
return true;
@@ -3892,18 +3794,16 @@ static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 128-bit
/// version of MOVDDUP.
-bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
-
+static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
if (VT.getSizeInBits() != 128)
return false;
- int e = VT.getVectorNumElements() / 2;
- for (int i = 0; i < e; ++i)
- if (!isUndefOrEqual(N->getMaskElt(i), i))
+ unsigned e = VT.getVectorNumElements() / 2;
+ for (unsigned i = 0; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
return false;
- for (int i = 0; i < e; ++i)
- if (!isUndefOrEqual(N->getMaskElt(e+i), i))
+ for (unsigned i = 0; i != e; ++i)
+ if (!isUndefOrEqual(Mask[e+i], i))
return false;
return true;
}
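// Concretely (editorial): the 128-bit form above accepts the v2f64 mask
// <0,0>, while the 256-bit Y form accepts <0,0,2,2> on v4f64 - each
// 128-bit half duplicates its own low element.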
@@ -3948,31 +3848,43 @@ bool X86::isVINSERTF128Index(SDNode *N) {
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
-unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- int NumOperands = SVOp->getValueType(0).getVectorNumElements();
+/// Handles 128-bit and 256-bit.
+static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for PSHUF/SHUFP");
- unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
+ // independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ assert((NumLaneElts == 2 || NumLaneElts == 4) &&
+ "Only supports 2 or 4 elements per lane");
+
+ unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
unsigned Mask = 0;
- for (int i = 0; i < NumOperands; ++i) {
- int Val = SVOp->getMaskElt(NumOperands-i-1);
- if (Val < 0) Val = 0;
- if (Val >= NumOperands) Val -= NumOperands;
- Mask |= Val;
- if (i != NumOperands - 1)
- Mask <<= Shift;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt < 0) continue;
+ Elt %= NumLaneElts;
+ unsigned ShAmt = i << Shift;
+ if (ShAmt >= 8) ShAmt -= 8;
+ Mask |= Elt << ShAmt;
}
+
return Mask;
}
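// Worked example (editorial): for the v4f32 reversal mask <3,2,1,0>,
// NumLaneElts == 4 gives Shift == 1, so element i contributes two bits at
// position 2*i: 3<<0 | 2<<2 | 1<<4 | 0<<6 == 0x1B, the classic
// "shufps $0x1b" reverse. For v8f32 the high lane's shift amounts fold back
// by 8, so both lanes OR into the same eight immediate bits - representable
// only when the two lanes use one repeated pattern.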
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
-unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
unsigned Mask = 0;
// 8 nodes, but we only care about the last 4.
for (unsigned i = 7; i >= 4; --i) {
- int Val = SVOp->getMaskElt(i);
+ int Val = N->getMaskElt(i);
if (Val >= 0)
Mask |= (Val - 4);
if (i != 4)
@@ -3983,12 +3895,11 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
-unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
unsigned Mask = 0;
// 8 nodes, but we only care about the first 4.
for (int i = 3; i >= 0; --i) {
- int Val = SVOp->getMaskElt(i);
+ int Val = N->getMaskElt(i);
if (Val >= 0)
Mask |= Val;
if (i != 0)
@@ -4002,14 +3913,21 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
- int Val = 0;
- unsigned i, e;
- for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ int Val = 0;
+ unsigned i;
+ for (i = 0; i != NumElts; ++i) {
Val = SVOp->getMaskElt(i);
if (Val >= 0)
break;
}
+ if (Val >= (int)NumElts)
+ Val -= NumElts - NumLaneElts;
+
assert(Val - i > 0 && "PALIGNR imm should be positive");
return (Val - i) * EltSize;
}
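// Worked example (editorial): for a v8i16 mask <1,2,3,4,5,6,7,8> the first
// defined element is Val == 1 at i == 0 and EltSize == 2, so the rotate
// amount is (1 - 0) * 2 == 2 bytes. The new adjustment above re-bases
// elements drawn from V2 so the immediate stays lane-relative, since the
// 256-bit PALIGNR rotates each 128-bit lane independently.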
@@ -4082,17 +4000,16 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
if (VT.getSizeInBits() != 128)
return false;
if (VT.getVectorNumElements() != 4)
return false;
for (unsigned i = 0, e = 2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ if (!isUndefOrEqual(Mask[i], i+2))
return false;
for (unsigned i = 2; i != 4; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ if (!isUndefOrEqual(Mask[i], i+4))
return false;
return true;
}
@@ -4140,8 +4057,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+ ArrayRef<int> Mask, EVT VT) {
if (VT.getSizeInBits() != 128)
return false;
@@ -4157,10 +4073,10 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
if (NumElems != 2 && NumElems != 4)
return false;
for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ if (!isUndefOrEqual(Mask[i], i))
return false;
for (unsigned i = NumElems/2; i != NumElems; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ if (!isUndefOrEqual(Mask[i], i+NumElems))
return false;
return true;
}
@@ -4208,15 +4124,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Always build SSE zero vectors as <4 x i32> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
if (VT.getSizeInBits() == 128) { // SSE
- if (HasXMMInt) { // SSE2
+ if (Subtarget->hasSSE2()) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
@@ -4224,12 +4140,17 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
} else if (VT.getSizeInBits() == 256) { // AVX
- // 256-bit logic and arithmetic instructions in AVX are
- // all floating-point, no support for integer ops. Default
- // to emitting fp zeroed vectors then.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ if (Subtarget->hasAVX2()) { // AVX2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ }
}
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
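// The canonicalization above can be read as (editorial sketch, SSE2 case):
//
//   v4i32 Zero = BUILD_VECTOR 0,0,0,0     // one canonical zero node
//   v2f64 Z64  = BITCAST Zero             // CSE shares the node above
//   v8i16 Z16  = BITCAST Zero             // likewise
//
// Every requested zero type funnels through the same v4i32 (or v4f32/v8f32,
// and now v8i32 for AVX2) constant, so identical zeros are CSE'd.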
@@ -4266,24 +4187,12 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- bool Changed = false;
- SmallVector<int, 8> MaskVec;
- SVOp->getMask(MaskVec);
-
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
for (unsigned i = 0; i != NumElems; ++i) {
- if (MaskVec[i] > (int)NumElems) {
- MaskVec[i] = NumElems;
- Changed = true;
+ if (Mask[i] > (int)NumElems) {
+ Mask[i] = NumElems;
}
}
- if (Changed)
- return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
- SVOp->getOperand(1), &MaskVec[0]);
- return SDValue(SVOp, 0);
}
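// For instance (editorial): with NumElems == 4 and V2 a splat, the mask
// <0,5,6,7> normalizes to <0,4,4,4> - every reference into V2 is redirected
// to V2's first element, which may then re-match as unpckl/unpckh in the
// splat handling of LowerVECTOR_SHUFFLE.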
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4387,7 +4296,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
- unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
if (Idx > 0)
EltNo -= NumElems/2;
@@ -4419,11 +4328,12 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasXMMInt,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT VT = V2.getValueType();
- SDValue V1 = isZero
- ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
@@ -4450,20 +4360,20 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
if (Index < 0)
return DAG.getUNDEF(VT.getVectorElementType());
- int NumElems = VT.getVectorNumElements();
- SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+ SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0)
+ : SV->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
SmallVector<unsigned, 16> ShuffleMask;
SDValue ImmN;
switch(Opcode) {
- case X86ISD::SHUFPS:
- case X86ISD::SHUFPD:
+ case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
@@ -4481,9 +4391,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
DecodeMOVLHPSMask(NumElems, ShuffleMask);
break;
case X86ISD::PSHUFD:
+ case X86ISD::VPERMILP:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFMask(NumElems,
- cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
break;
case X86ISD::PSHUFHW:
@@ -4505,14 +4415,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
- case X86ISD::VPERMILP:
- ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- ShuffleMask);
- break;
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
break;
case X86ISD::MOVDDUP:
@@ -4523,16 +4428,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
case X86ISD::MOVSLDUP:
case X86ISD::PALIGN:
return SDValue(); // Not yet implemented.
- default:
- assert(0 && "unknown target shuffle node");
- return SDValue();
+ default: llvm_unreachable("unknown target shuffle node");
}
Index = ShuffleMask[Index];
if (Index < 0)
return DAG.getUNDEF(VT.getVectorElementType());
- SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0)
+ : N->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
Depth+1);
}
@@ -4693,6 +4597,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 8)
return SDValue();
@@ -4704,7 +4609,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
if (NumZero)
- V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else
V = DAG.getUNDEF(MVT::v8i16);
First = false;
@@ -4740,6 +4645,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
+ const X86Subtarget* Subtarget,
const TargetLowering &TLI) {
if (NumNonZero > 4)
return SDValue();
@@ -4752,7 +4658,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
if (isNonZero) {
if (First) {
if (NumZero)
- V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else
V = DAG.getUNDEF(MVT::v8i16);
First = false;
@@ -4773,7 +4679,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
const TargetLowering &TLI, DebugLoc dl) {
assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
- unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+ unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(Opc, dl, ShVT, SrcOp,
@@ -4841,21 +4747,16 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
int EltNo = (Offset - StartOffset) >> 2;
int NumElems = VT.getVectorNumElements();
- EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- // Canonicalize it to a v4i32 or v8i32 shuffle.
SmallVector<int, 8> Mask;
for (int i = 0; i < NumElems; ++i)
Mask.push_back(EltNo);
- V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
- return DAG.getNode(ISD::BITCAST, dl, NVT,
- DAG.getVectorShuffle(CanonVT, dl, V1,
- DAG.getUNDEF(CanonVT),&Mask[0]));
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
return SDValue();
@@ -4938,7 +4839,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
/// a scalar load.
/// The scalar load node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
+static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
EVT VT = Op.getValueType();
SDValue V = Op;
@@ -4993,22 +4897,14 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
if (!ISD::isNormalLoad(Ld.getNode()))
return SDValue();
+ // Reject loads that have uses of the chain result
+ if (Ld->hasAnyUseOfValue(1))
+ return SDValue();
+
bool Is256 = VT.getSizeInBits() == 256;
bool Is128 = VT.getSizeInBits() == 128;
unsigned ScalarSize = Ld.getValueType().getSizeInBits();
- if (hasAVX2) {
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 8 || ScalarSize == 16 ||
- ScalarSize == 32 || ScalarSize == 64 ))
- return Ld;
-
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 8 || ScalarSize == 32 ||
- ScalarSize == 16 || ScalarSize == 64 ))
- return Ld;
- }
-
// VBroadcast to YMM
if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
return Ld;
@@ -5017,6 +4913,17 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
if (Is128 && (ScalarSize == 32))
return Ld;
+ // The integer check is needed for the 64-bit into 128-bit case so it doesn't
+ // match double, since there is no vbroadcastsd xmm form.
+ if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
+ // VBroadcast to YMM
+ if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
+ return Ld;
+
+ // VBroadcast to XMM
+ if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
+ return Ld;
+ }
// Unsupported broadcast.
return SDValue();
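// Summarizing the accepted (vector width, scalar size) pairs (editorial):
//   AVX:   256-bit from 32/64-bit scalars; 128-bit from 32-bit scalars.
//   AVX2:  additionally 256-bit from i8/i16 and 128-bit from i8/i16/i64,
//          integer-only, so a 64-bit double into 128 bits never matches.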
@@ -5034,27 +4941,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (Op.getValueType() == MVT::v4i32 ||
- Op.getValueType() == MVT::v8i32)
+ if (VT == MVT::v4i32 || VT == MVT::v8i32)
return Op;
- return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, dl);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (Op.getValueType() == MVT::v4i32 ||
- (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2()))
+ if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
return Op;
- return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl);
+ return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
}
- SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
- if (Subtarget->hasAVX() && LD.getNode())
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+ SDValue LD = isVectorBroadcast(Op, Subtarget);
+ if (LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -5105,8 +5010,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasXMMInt(), DAG);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
@@ -5119,7 +5023,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(Item.getValueType()),
&Mask[0]);
}
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
@@ -5128,23 +5032,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
- if (NumZero == 0) {
+ if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+
+ if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ if (VT.getSizeInBits() == 256) {
+ SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+ Item, DAG.getIntPtrConstant(0));
+ }
+ assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
- return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(),
- DAG);
- } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
+
+ if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- unsigned NumBits = VT.getSizeInBits();
- assert((NumBits == 128 || NumBits == 256) &&
- "Expected an SSE or AVX value type!");
- EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32;
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasXMMInt(), DAG);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ if (VT.getSizeInBits() == 256) {
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
+ DAG, dl);
+ } else {
+ assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ }
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
@@ -5172,8 +5086,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a shuffle of zero and zero-extended scalar to vector.
- Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
- Subtarget->hasXMMInt(), DAG);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
@@ -5203,9 +5116,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+ if (VT.getSizeInBits() == 256) {
SmallVector<SDValue, 32> V;
- for (unsigned i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
@@ -5229,8 +5142,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
- return getShuffleVectorZeroOrUndef(V2, Idx, true,
- Subtarget->hasXMMInt(), DAG);
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
@@ -5238,24 +5150,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16) {
SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
- *this);
+ Subtarget, *this);
if (V.getNode()) return V;
}
if (EVTBits == 16 && NumElems == 8) {
SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
- *this);
+ Subtarget, *this);
if (V.getNode()) return V;
}
// If element VT is == 32 bits, turn it into a number of shuffles.
- SmallVector<SDValue, 8> V;
- V.resize(NumElems);
+ SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
+ V[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
@@ -5278,13 +5189,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
}
- SmallVector<int, 8> MaskVec;
- bool Reverse = (NonZeros & 0x3) == 2;
- for (unsigned i = 0; i < 2; ++i)
- MaskVec.push_back(Reverse ? 1-i : i);
- Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
- for (unsigned i = 0; i < 2; ++i)
- MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
+ bool Reverse1 = (NonZeros & 0x3) == 2;
+ bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ int MaskVec[] = {
+ Reverse1 ? 1 : 0,
+ Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems+1)
+ };
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
}
@@ -5299,7 +5211,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41orAVX()) {
+ if (getSubtarget()->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -5430,7 +5342,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// mask values count as coming from any quadword, for better codegen.
unsigned LoQuad[] = { 0, 0, 0, 0 };
unsigned HiQuad[] = { 0, 0, 0, 0 };
- BitVector InputQuads(4);
+ std::bitset<4> InputQuads;
for (unsigned i = 0; i < 8; ++i) {
unsigned *Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
@@ -5470,10 +5382,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3orAVX()) {
+ if (Subtarget->hasSSSE3()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
- BestLoQuad = InputQuads.find_first();
- BestHiQuad = InputQuads.find_next(BestLoQuad);
+ BestLoQuad = InputQuads[0] ? 0 : 1;
+ BestHiQuad = InputQuads[2] ? 2 : 3;
}
if (InputQuads.count() > 2) {
BestLoQuad = -1;
@@ -5486,9 +5398,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// words from all 4 input quadwords.
SDValue NewV;
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
- SmallVector<int, 8> MaskV;
- MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
- MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
+ int MaskV[] = {
+ BestLoQuad < 0 ? 0 : BestLoQuad,
+ BestHiQuad < 0 ? 1 : BestHiQuad
+ };
NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
@@ -5533,8 +5446,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
unsigned TargetMask = 0;
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
- TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
- X86::getShufflePSHUFLWImmediate(NewV.getNode());
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+ TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
+ getShufflePSHUFLWImmediate(SVOp);
V1 = NewV.getOperand(0);
return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
}
@@ -5543,7 +5457,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
- if (Subtarget->hasSSSE3orAVX()) {
+ if (Subtarget->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
@@ -5591,59 +5505,51 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
// and update MaskVals with new element order.
- BitVector InOrder(8);
+ std::bitset<8> InOrder;
if (BestLoQuad >= 0) {
- SmallVector<int, 8> MaskV;
+ int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
for (int i = 0; i != 4; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
- MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestLoQuad) {
- MaskV.push_back(idx & 3);
+ MaskV[i] = idx & 3;
InOrder.set(i);
- } else {
- MaskV.push_back(-1);
}
}
- for (unsigned i = 4; i != 8; ++i)
- MaskV.push_back(i);
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
- NewV.getOperand(0),
- X86::getShufflePSHUFLWImmediate(NewV.getNode()),
- DAG);
+ NewV.getOperand(0),
+ getShufflePSHUFLWImmediate(SVOp), DAG);
+ }
}
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
- SmallVector<int, 8> MaskV;
- for (unsigned i = 0; i != 4; ++i)
- MaskV.push_back(i);
+ int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
for (unsigned i = 4; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
- MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestHiQuad) {
- MaskV.push_back((idx & 3) + 4);
+ MaskV[i] = (idx & 3) + 4;
InOrder.set(i);
- } else {
- MaskV.push_back(-1);
}
}
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
- NewV.getOperand(0),
- X86::getShufflePSHUFHWImmediate(NewV.getNode()),
- DAG);
+ NewV.getOperand(0),
+ getShufflePSHUFHWImmediate(SVOp), DAG);
+ }
}
// In case BestHi & BestLo were both -1, which means each quadword has a word
@@ -5685,8 +5591,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- SmallVector<int, 16> MaskVals;
- SVOp->getMask(MaskVals);
+ ArrayRef<int> MaskVals = SVOp->getMask();
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
@@ -5705,7 +5610,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3orAVX()) {
+ if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
@@ -5836,7 +5741,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
EVT NewVT;
switch (VT.getSimpleVT().SimpleTy) {
- default: assert(false && "Unexpected!");
+ default: llvm_unreachable("Unexpected!");
case MVT::v4f32: NewVT = MVT::v2f64; break;
case MVT::v4i32: NewVT = MVT::v2i64; break;
case MVT::v8i16: NewVT = MVT::v4i32; break;
@@ -5902,96 +5807,106 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
OpVT, SrcOp)));
}
-/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
-/// shuffle node referes to only one lane in the sources.
-static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
- int HalfSize = NumElems/2;
- SmallVector<int, 16> M;
- SVOp->getMask(M);
- bool MatchA = false, MatchB = false;
-
- for (int l = 0; l < NumElems*2; l += HalfSize) {
- if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
- MatchA = true;
- break;
- }
- }
-
- for (int l = 0; l < NumElems*2; l += HalfSize) {
- if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
- MatchB = true;
- break;
- }
- }
-
- return MatchA && MatchB;
-}
-
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
- // If each half of a vector shuffle node referes to only one lane in the
- // source vectors, extract each used 128-bit lane and shuffle them using
- // 128-bit shuffles. Then, concatenate the results. Otherwise leave
- // the work to the legalizer.
- DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
- int HalfSize = NumElems/2;
-
- // Extract the reference for each half
- int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
- int FstVecOpNum = 0, SndVecOpNum = 0;
- for (int i = 0; i < HalfSize; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (SVOp->getMaskElt(i) < 0)
+ EVT VT = SVOp->getValueType(0);
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLaneElems = NumElems / 2;
+
+ int MinRange[2][2] = { { static_cast<int>(NumElems),
+ static_cast<int>(NumElems) },
+ { static_cast<int>(NumElems),
+ static_cast<int>(NumElems) } };
+ int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } };
+
+ // Collect used ranges for each source in each lane
+ for (unsigned l = 0; l < 2; ++l) {
+ unsigned LaneStart = l*NumLaneElems;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0)
continue;
- FstVecOpNum = Elt/NumElems;
- FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
- break;
+
+ int Input = 0;
+ if (Idx >= (int)NumElems) {
+ Idx -= NumElems;
+ Input = 1;
+ }
+
+ if (Idx > MaxRange[l][Input])
+ MaxRange[l][Input] = Idx;
+ if (Idx < MinRange[l][Input])
+ MinRange[l][Input] = Idx;
}
- for (int i = HalfSize; i < NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (SVOp->getMaskElt(i) < 0)
+ }
+
+ // Make sure each used range lies within a single 128-bit half of its source
+ int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } };
+ for (unsigned l = 0; l < 2; ++l) {
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0)
continue;
- SndVecOpNum = Elt/NumElems;
- SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
- break;
+
+ if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems)
+ ExtractIdx[l][Input] = 0;
+ else if (MinRange[l][Input] >= (int)NumLaneElems &&
+ MaxRange[l][Input] < (int)NumElems)
+ ExtractIdx[l][Input] = NumLaneElems;
+ else
+ return SDValue();
}
+ }
- // Extract the subvectors
- SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
- DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
- DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ SDValue Ops[2][2];
+ for (unsigned l = 0; l < 2; ++l) {
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ if (ExtractIdx[l][Input] >= 0)
+ Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input),
+ DAG.getConstant(ExtractIdx[l][Input], MVT::i32),
+ DAG, dl);
+ else
+ Ops[l][Input] = DAG.getUNDEF(NVT);
+ }
+ }
- // Generate 128-bit shuffles
- SmallVector<int, 16> MaskV1, MaskV2;
- for (int i = 0; i < HalfSize; ++i) {
- int Elt = SVOp->getMaskElt(i);
- MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ // Generate 128-bit shuffles
+ SmallVector<int, 16> Mask1, Mask2;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (Elt >= (int)NumElems) {
+ Elt %= NumLaneElems;
+ Elt += NumLaneElems;
+ } else if (Elt >= 0) {
+ Elt %= NumLaneElems;
}
- for (int i = HalfSize; i < NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ Mask1.push_back(Elt);
+ }
+ for (unsigned i = NumLaneElems; i != NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (Elt >= (int)NumElems) {
+ Elt %= NumLaneElems;
+ Elt += NumLaneElems;
+ } else if (Elt >= 0) {
+ Elt %= NumLaneElems;
}
-
- EVT NVT = V1.getValueType();
- V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
- V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
-
- // Concatenate the result back
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ Mask2.push_back(Elt);
}
- return SDValue();
+ SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]);
+ SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]);
+
+ // Concatenate the result back
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
}
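// Editorial sketch of the per-half mask rewrite performed above (it mirrors
// the Mask1/Mask2 construction; the helper and plain-array interface are
// hypothetical). Indices are re-based into half-width space, keeping the
// two extracted chunks as sources 0 and 1 of the new 128-bit shuffle:
static void rebaseHalfMask(const int *Mask, int NumElems, int HalfBase,
                           int *HalfMask) {
  int NumLaneElems = NumElems / 2;
  for (int i = 0; i != NumLaneElems; ++i) {
    int Elt = Mask[HalfBase + i];
    if (Elt >= NumElems)
      Elt = Elt % NumLaneElems + NumLaneElems; // came from the second input
    else if (Elt >= 0)
      Elt %= NumLaneElems;                     // came from the first input
    HalfMask[i] = Elt;
  }
}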
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
@@ -6005,11 +5920,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
- SmallVector<std::pair<int, int>, 8> Locs;
- Locs.resize(4);
- SmallVector<int, 8> Mask1(4U, -1);
- SmallVector<int, 8> PermMask;
- SVOp->getMask(PermMask);
+ std::pair<int, int> Locs[4];
+ int Mask1[] = { -1, -1, -1, -1 };
+ SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
unsigned NumHi = 0;
unsigned NumLo = 0;
@@ -6039,17 +5952,14 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
// vector operands, put the elements into the right order.
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- SmallVector<int, 8> Mask2(4U, -1);
+ int Mask2[] = { -1, -1, -1, -1 };
- for (unsigned i = 0; i != 4; ++i) {
- if (Locs[i].first == -1)
- continue;
- else {
+ for (unsigned i = 0; i != 4; ++i)
+ if (Locs[i].first != -1) {
unsigned Idx = (i < 2) ? 0 : 4;
Idx += Locs[i].first * 2 + Locs[i].second;
Mask2[i] = Idx;
}
- }
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
} else if (NumLo == 3 || NumHi == 3) {
@@ -6102,18 +6012,16 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
// Break it into (shuffle shuffle_hi, shuffle_lo).
- Locs.clear();
- Locs.resize(4);
- SmallVector<int,8> LoMask(4U, -1);
- SmallVector<int,8> HiMask(4U, -1);
+ int LoMask[] = { -1, -1, -1, -1 };
+ int HiMask[] = { -1, -1, -1, -1 };
- SmallVector<int,8> *MaskPtr = &LoMask;
+ int *MaskPtr = LoMask;
unsigned MaskIdx = 0;
unsigned LoIdx = 0;
unsigned HiIdx = 2;
for (unsigned i = 0; i != 4; ++i) {
if (i == 2) {
- MaskPtr = &HiMask;
+ MaskPtr = HiMask;
MaskIdx = 1;
LoIdx = 0;
HiIdx = 2;
@@ -6123,26 +6031,21 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
Locs[i] = std::make_pair(-1, -1);
} else if (Idx < 4) {
Locs[i] = std::make_pair(MaskIdx, LoIdx);
- (*MaskPtr)[LoIdx] = Idx;
+ MaskPtr[LoIdx] = Idx;
LoIdx++;
} else {
Locs[i] = std::make_pair(MaskIdx, HiIdx);
- (*MaskPtr)[HiIdx] = Idx;
+ MaskPtr[HiIdx] = Idx;
HiIdx++;
}
}
SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
- SmallVector<int, 8> MaskOps;
- for (unsigned i = 0; i != 4; ++i) {
- if (Locs[i].first == -1) {
- MaskOps.push_back(-1);
- } else {
- unsigned Idx = Locs[i].first * 4 + Locs[i].second;
- MaskOps.push_back(Idx);
- }
- }
+ int MaskOps[] = { -1, -1, -1, -1 };
+ for (unsigned i = 0; i != 4; ++i)
+ if (Locs[i].first != -1)
+ MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}
@@ -6220,35 +6123,41 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
+ // If we are accessing the upper part of a YMM register
+ // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of
+ // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point
+ // because N has not been legalized yet.
+ if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256)
+ return false;
+
// Skip one more bit_convert if necessary
- if (V.getOpcode() == ISD::BITCAST)
+ if (V.getOpcode() == ISD::BITCAST) {
+ if (!V.hasOneUse())
+ return false;
V = V.getOperand(0);
+ }
- if (ISD::isNormalLoad(V.getNode())) {
- // Is the original load suitable?
- LoadSDNode *LN0 = cast<LoadSDNode>(V);
+ if (!ISD::isNormalLoad(V.getNode()))
+ return false;
- // FIXME: avoid the multi-use bug that is preventing lots of
- // of foldings to be detected, this is still wrong of course, but
- // give the temporary desired behavior, and if it happens that
- // the load has real more uses, during isel it will not fold, and
- // will generate poor code.
- if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
- return false;
+ // Is the original load suitable?
+ LoadSDNode *LN0 = cast<LoadSDNode>(V);
- if (!HasShuffleIntoBitcast)
- return true;
+ if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+ return false;
- // If there's a bitcast before the shuffle, check if the load type and
- // alignment is valid.
- unsigned Align = LN0->getAlignment();
- unsigned NewAlign =
- TLI.getTargetData()->getABITypeAlignment(
- VT.getTypeForEVT(*DAG.getContext()));
+ if (!HasShuffleIntoBitcast)
+ return true;
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
- return false;
- }
+ // If there's a bitcast before the shuffle, check if the load type and
+ // alignment is valid.
+ unsigned Align = LN0->getAlignment();
+ unsigned NewAlign =
+ TLI.getTargetData()->getABITypeAlignment(
+ VT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+ return false;
return true;
}
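// Worked example (editorial) of the alignment guard: folding an extract
// through a bitcast of, say, an align-4 v4f32 load into a v2i64-typed load
// computes NewAlign == 16 from the ABI alignment of v2i64; 16 > 4, so the
// fold is rejected rather than emitting an under-aligned wide load.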
@@ -6266,14 +6175,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
- bool HasXMMInt) {
+ bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
- if (HasXMMInt && VT == MVT::v2f64)
+ if (HasSSE2 && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
// v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
@@ -6299,24 +6208,8 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}
-static inline unsigned getSHUFPOpcode(EVT VT) {
- switch(VT.getSimpleVT().SimpleTy) {
- case MVT::v8i32: // Use fp unit for int unpack.
- case MVT::v8f32:
- case MVT::v4i32: // Use fp unit for int unpack.
- case MVT::v4f32: return X86ISD::SHUFPS;
- case MVT::v4i64: // Use fp unit for int unpack.
- case MVT::v4f64:
- case MVT::v2i64: // Use fp unit for int unpack.
- case MVT::v2f64: return X86ISD::SHUFPD;
- default:
- llvm_unreachable("Unknown type for shufp*");
- }
- return 0;
-}
-
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -6342,7 +6235,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
if (CanFoldLoad) {
- if (HasXMMInt && NumElems == 2)
+ if (HasSSE2 && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
@@ -6357,10 +6250,10 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
- if (HasXMMInt) {
+ if (HasSSE2) {
// FIXME: isMOVLMask should be checked and matched before getMOVLP,
// as to remove this logic from here, as much as possible
- if (NumElems == 2 || !X86::isMOVLMask(SVOp))
+ if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
}
@@ -6368,8 +6261,8 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
assert(VT != MVT::v4i32 && "unsupported shuffle type");
// Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
- X86::getShuffleSHUFImmediate(SVOp), DAG);
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+ getShuffleSHUFImmediate(SVOp), DAG);
}
static
@@ -6383,7 +6276,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
+ return getZeroVector(VT, Subtarget, DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
@@ -6397,8 +6290,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
return Op;
// Use vbroadcast whenever the splat comes from a foldable load
- SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
- if (Subtarget->hasAVX() && LD.getNode())
+ SDValue LD = isVectorBroadcast(Op, Subtarget);
+ if (LD.getNode())
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
// Handle splats by matching through known shuffle masks
@@ -6417,21 +6310,26 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
} else if ((VT == MVT::v4i32 ||
- (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
+ (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode()) {
- if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
- return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
+ EVT NewVT = NewOp.getValueType();
+ if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
+ NewVT, true, false))
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
DAG, Subtarget, dl);
}
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
- if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
- return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
- DAG, Subtarget, dl);
+ if (NewOp.getNode()) {
+ EVT NewVT = NewOp.getValueType();
+ if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
+ DAG, Subtarget, dl);
+ }
}
}
return SDValue();
@@ -6445,10 +6343,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
- bool HasXMMInt = Subtarget->hasXMMInt();
+ bool HasSSE2 = Subtarget->hasSSE2();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
MachineFunction &MF = DAG.getMachineFunction();
@@ -6456,7 +6355,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
- assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef");
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
// Vector shuffle lowering takes 3 steps:
//
@@ -6479,38 +6381,43 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (NewOp.getNode())
return NewOp;
+ SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
// NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
// unpckh_undef). Only use pshufd if speed is more important than size.
- if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
+ if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
+ if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
- if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() &&
+ if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
V2IsUndef && RelaxedMayFoldVectorLoad(V1))
return getMOVDDup(Op, dl, V1, DAG);
- if (X86::isMOVHLPS_v_undef_Mask(SVOp))
+ if (isMOVHLPS_v_undef_Mask(M, VT))
return getMOVHighToLow(Op, dl, DAG);
// Used to match splats
- if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef &&
+ if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
(VT == MVT::v2f64 || VT == MVT::v2i64))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
- if (X86::isPSHUFDMask(SVOp)) {
+ if (isPSHUFDMask(M, VT)) {
// The actual implementation will match the mask in the if above, and then
// during isel it can match several different instructions, not only pshufd
// as its name says. Sad but true; emulate the behavior for now...
- if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+ if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+ return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+
+ unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
- unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
+ if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
+ return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
- if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
+ if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
- return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
TargetMask, DAG);
}
@@ -6518,7 +6425,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool isLeft = false;
unsigned ShAmt = 0;
SDValue ShVal;
- bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
@@ -6527,11 +6434,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
}
- if (X86::isMOVLMask(SVOp)) {
+ if (isMOVLMask(M, VT)) {
if (ISD::isBuildVectorAllZeros(V1.getNode()))
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
- if (!X86::isMOVLPMask(SVOp)) {
- if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
+ if (!isMOVLPMask(M, VT)) {
+ if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
if (VT == MVT::v4i32 || VT == MVT::v4f32)
@@ -6540,27 +6447,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
// FIXME: fold these into legal mask.
- if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2))
- return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
+ if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
+ return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
- if (X86::isMOVHLPSMask(SVOp))
+ if (isMOVHLPSMask(M, VT))
return getMOVHighToLow(Op, dl, DAG);
- if (X86::isMOVSHDUPMask(SVOp, Subtarget))
+ if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
- if (X86::isMOVSLDUPMask(SVOp, Subtarget))
+ if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
- if (X86::isMOVLPMask(SVOp))
- return getMOVLP(Op, dl, DAG, HasXMMInt);
+ if (isMOVLPMask(M, VT))
+ return getMOVLP(Op, dl, DAG, HasSSE2);
- if (ShouldXformToMOVHLPS(SVOp) ||
- ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
+ if (ShouldXformToMOVHLPS(M, VT) ||
+ ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
return CommuteVectorShuffle(SVOp, DAG);
if (isShift) {
- // No better options. Use a vshl / vsrl.
+ // No better options. Use a vshldq / vsrldq.
EVT EltVT = VT.getVectorElementType();
ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
@@ -6573,18 +6480,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
V2IsSplat = isSplatVector(V2.getNode());
// Canonicalize the splat or undef, if present, to be on the RHS.
- if (V1IsSplat && !V2IsSplat) {
- Op = CommuteVectorShuffle(SVOp, DAG);
- SVOp = cast<ShuffleVectorSDNode>(Op);
- V1 = SVOp->getOperand(0);
- V2 = SVOp->getOperand(1);
+ if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+ CommuteVectorShuffleMask(M, NumElems);
+ std::swap(V1, V2);
std::swap(V1IsSplat, V2IsSplat);
Commuted = true;
}
- SmallVector<int, 32> M;
- SVOp->getMask(M);
-
if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
// Shuffling low element of v1 into undef, just return v1.
if (V2IsUndef)
@@ -6604,41 +6506,40 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (V2IsSplat) {
    // Normalize the mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If they match, return a
- // new vector_shuffle with the corrected mask.
- SDValue NewMask = NormalizeMask(SVOp, DAG);
- ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
- if (NSVOp != SVOp) {
- if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) {
- return NewMask;
- } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) {
- return NewMask;
- }
+    // new vector_shuffle with the corrected mask.
+ SmallVector<int, 8> NewMask(M.begin(), M.end());
+ NormalizeMask(NewMask, NumElems);
+ if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+ } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
}
}
if (Commuted) {
    // Commute it back and try unpck* again.
// FIXME: this seems wrong.
- SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
- ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+ CommuteVectorShuffleMask(M, NumElems);
+ std::swap(V1, V2);
+ std::swap(V1IsSplat, V2IsSplat);
+ Commuted = false;
- if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
+ if (isUNPCKLMask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
+ if (isUNPCKHMask(M, VT, HasAVX2))
+ return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
}
// Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
- isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
+  if (!V2IsUndef && isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))
return CommuteVectorShuffle(SVOp, DAG);
  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here for now so we can directly emit target-specific nodes;
  // they will be removed one by one once they no longer return Op.
- if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()))
+ if (isPALIGNRMask(M, VT, Subtarget))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
getShufflePALIGNRImmediate(SVOp),
DAG);
@@ -6651,21 +6552,21 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (isPSHUFHWMask(M, VT))
return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
- X86::getShufflePSHUFHWImmediate(SVOp),
+ getShufflePSHUFHWImmediate(SVOp),
DAG);
if (isPSHUFLWMask(M, VT))
return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
- X86::getShufflePSHUFLWImmediate(SVOp),
+ getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT))
- return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
- X86::getShuffleSHUFImmediate(SVOp), DAG);
+ if (isSHUFPMask(M, VT, HasAVX))
+ return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
+ getShuffleSHUFImmediate(SVOp), DAG);
- if (isUNPCKL_v_undef_Mask(M, VT))
+ if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (isUNPCKH_v_undef_Mask(M, VT))
+ if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
//===--------------------------------------------------------------------===//
@@ -6678,20 +6579,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
// Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT, HasAVX))
+ if (isVPERMILPMask(M, VT, HasAVX)) {
+ if (HasAVX2 && VT == MVT::v8i32)
+ return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+ getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
- getShuffleVPERMILPImmediate(SVOp), DAG);
+ getShuffleSHUFImmediate(SVOp), DAG);
+ }
// Handle VPERM2F128/VPERM2I128 permutations
if (isVPERM2X128Mask(M, VT, HasAVX))
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- // Handle VSHUFPS/DY permutations
- if (isVSHUFPYMask(M, VT, HasAVX))
- return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
- getShuffleVSHUFPYImmediate(SVOp), DAG);
-
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
// lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -6810,7 +6710,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
- if (Subtarget->hasSSE41orAVX()) {
+ if (Subtarget->hasSSE41()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
if (Res.getNode())
return Res;
@@ -6952,7 +6852,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
}
- if (Subtarget->hasSSE41orAVX())
+ if (Subtarget->hasSSE41())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
if (EltVT == MVT::i8)
@@ -7408,19 +7308,77 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
Chain.getValue(1));
- }
+ } else if (Subtarget->isTargetWindows()) {
+    // Just use the implicit TLS architecture.
+    // We need to generate something similar to:
+ // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+ // ; from TEB
+    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
+ // mov rcx, qword [rdx+rcx*8]
+ // mov eax, .tls$:tlsvar
+ // [rax+rcx] contains the address
+ // Windows 64bit: gs:0x58
+ // Windows 32bit: fs:__tls_array
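+    // (gs:[0x58] is the ThreadLocalStoragePointer field of the 64-bit TEB;
+    // the 32-bit TEB keeps the same pointer at fs:[0x2C], which the CRT
+    // exposes as _tls_array.)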
- assert(false &&
- "TLS not implemented for this target.");
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GV = GA->resolveAliasedGlobal(false);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue Chain = DAG.getEntryNode();
- llvm_unreachable("Unreachable");
- return SDValue();
+ // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+ // %gs:0x58 (64-bit).
+ Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+ ? Type::getInt8PtrTy(*DAG.getContext(),
+ 256)
+ : Type::getInt32PtrTy(*DAG.getContext(),
+ 257));
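+    // (The x86 backend models the GS segment as address space 256 and the
+    // FS segment as address space 257.)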
+
+ SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
+ Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58)
+ : DAG.getExternalSymbol("_tls_array",
+ getPointerTy()),
+ MachinePointerInfo(Ptr),
+ false, false, false, 0);
+
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ if (Subtarget->is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+ IDX, MachinePointerInfo(), MVT::i32,
+ false, false, 0);
+ else
+ IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+ false, false, false, 0);
+
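+    // Scale the index by the pointer size, e.g. for 8-byte pointers
+    // Log2_64_Ceil(8) == 3, so the shift below computes IDX * 8.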
+ SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+ getPointerTy());
+ IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+ SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+ false, false, false, 0);
+
+    // Get the offset of the start of the .tls section.
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread-local variable is the sum of the thread
+    // pointer and the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+ }
+
+ llvm_unreachable("TLS not implemented for this target.");
}
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
-/// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
@@ -7561,85 +7519,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SelectionDAG &DAG) const {
- // This algorithm is not obvious. Here it is in C code, more or less:
+  // This algorithm is not obvious. Here is what we're trying to output:
/*
- double uint64_to_double( uint32_t hi, uint32_t lo ) {
- static const __m128i exp = { 0x4330000045300000ULL, 0 };
- static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
-
- // Copy ints to xmm registers.
- __m128i xh = _mm_cvtsi32_si128( hi );
- __m128i xl = _mm_cvtsi32_si128( lo );
-
- // Combine into low half of a single xmm register.
- __m128i x = _mm_unpacklo_epi32( xh, xl );
- __m128d d;
- double sd;
-
- // Merge in appropriate exponents to give the integer bits the right
- // magnitude.
- x = _mm_unpacklo_epi32( x, exp );
-
- // Subtract away the biases to deal with the IEEE-754 double precision
- // implicit 1.
- d = _mm_sub_pd( (__m128d) x, bias );
-
- // All conversions up to here are exact. The correctly rounded result is
- // calculated using the current rounding mode using the following
- // horizontal add.
- d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
- _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
- // store doesn't really need to be here (except
- // maybe to zero the other double)
- return sd;
- }
+ movq %rax, %xmm0
+ punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+ subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+ #ifdef __SSE3__
+ haddpd %xmm0, %xmm0
+ #else
+ pshufd $0x4e, %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+ #endif
*/
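+  // How the constants work: punpckldq pairs the low dword of the input with
+  // 0x43300000 (the exponent bits of 2^52) and the high dword with 0x45300000
+  // (the exponent bits of 2^84), forming the doubles 2^52 + lo and
+  // 2^84 + hi * 2^32. Subtracting the biases 2^52 and 2^84 makes both halves
+  // exact; only the final add rounds.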
DebugLoc dl = Op.getDebugLoc();
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
- SmallVector<Constant*,4> CV0;
- CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
- CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
- CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
- CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
- Constant *C0 = ConstantVector::get(CV0);
+ const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ Constant *C0 = ConstantDataVector::get(*Context, CV0);
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
- SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Op.getOperand(0),
- DAG.getIntPtrConstant(1)));
- SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Op.getOperand(0),
- DAG.getIntPtrConstant(0)));
- SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+ // Load the 64-bit value into an XMM register.
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Op.getOperand(0));
SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
- SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
- SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
+ SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+ CLod0);
+
SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
+ SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Result;
- // Add the halves; easiest way is to swap them into another reg first.
- int ShufMask[2] = { 1, -1 };
- SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
- DAG.getUNDEF(MVT::v2f64), ShufMask);
- SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+ if (Subtarget->hasSSE3()) {
+ // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+ Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+ } else {
+ SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+ SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+ S2F, 0x4E, DAG);
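+    // 0x4E selects elements <2,3,0,1>, i.e. it swaps the two 64-bit halves.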
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+ Sub);
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0));
}
@@ -7656,8 +7594,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
Op.getOperand(0));
// Zero out the upper parts of the register.
- Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
- DAG);
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
@@ -7709,6 +7646,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return LowerUINT_TO_FP_i64(Op, DAG);
else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
+ else if (Subtarget->is64Bit() &&
+ SrcVT == MVT::i64 && DstVT == MVT::f32)
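+    // Returning SDValue() from custom lowering tells the legalizer to fall
+    // back to its default expansion for this i64 -> f32 conversion.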
+ return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
@@ -7728,7 +7668,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo(),
+ StackSlot, MachinePointerInfo(),
false, false, 0);
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
@@ -7776,19 +7716,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
}
std::pair<SDValue,SDValue> X86TargetLowering::
-FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
+FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
+                bool IsReplace) const {
DebugLoc DL = Op.getDebugLoc();
EVT DstTy = Op.getValueType();
- if (!IsSigned) {
+ if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
- "Unknown FP_TO_SINT to lower!");
+ "Unknown FP_TO_INT to lower!");
// These are really Legal.
if (DstTy == MVT::i32 &&
@@ -7799,26 +7739,29 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
- // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
- // stack slot.
+ // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+ // stack slot, or into the FTOL runtime function.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-
-
unsigned Opc;
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
+ if (!IsSigned && isIntegerTypeFTOL(DstTy))
+ Opc = X86ISD::WIN_FTOL;
+ else
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
EVT TheVT = Op.getOperand(0).getValueType();
+  // FIXME: This causes a redundant load/store if the SSE-class value is
+  // already in memory, such as if it is on the call stack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot,
@@ -7843,12 +7786,26 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOStore, MemSize, MemSize);
- // Build the FP_TO_INT*_IN_MEM
- SDValue Ops[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, 3, DstTy, MMO);
-
- return std::make_pair(FIST, StackSlot);
+ if (Opc != X86ISD::WIN_FTOL) {
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ Ops, 3, DstTy, MMO);
+ return std::make_pair(FIST, StackSlot);
+ } else {
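+    // X86ISD::WIN_FTOL lowers to a call to the MSVC runtime's _ftol2, which
+    // takes its argument in ST(0) and returns the 64-bit result in EDX:EAX.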
+ SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, Value);
+ SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+ MVT::i32, ftol.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+ MVT::i32, eax.getValue(2));
+ SDValue Ops[] = { eax, edx };
+ SDValue pair = IsReplace
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
+ : DAG.getMergeValues(Ops, 2, DL);
+ return std::make_pair(pair, SDValue());
+ }
}
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
@@ -7856,27 +7813,37 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
if (Op.getValueType().isVector())
return SDValue();
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
if (FIST.getNode() == 0) return Op;
- // Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+ else
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
SelectionDAG &DAG) const {
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+ /*IsSigned=*/ false, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
assert(FIST.getNode() && "Unexpected failure");
- // Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(),
- false, false, false, 0);
+ if (StackSlot.getNode())
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
+ else
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
@@ -7887,15 +7854,14 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op,
EVT EltVT = VT;
if (VT.isVector())
EltVT = VT.getVectorElementType();
- SmallVector<Constant*,4> CV;
+ Constant *C;
if (EltVT == MVT::f64) {
- Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
- CV.assign(2, C);
+ C = ConstantVector::getSplat(2,
+ ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
} else {
- Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
- CV.assign(4, C);
+ C = ConstantVector::getSplat(4,
+ ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
}
- Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -7913,15 +7879,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
}
- SmallVector<Constant*,8> CV;
- if (EltVT == MVT::f64) {
- Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
- CV.assign(NumElts, C);
- } else {
- Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
- CV.assign(NumElts, C);
- }
- Constant *C = ConstantVector::get(CV);
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
+ else
+ C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
+ C = ConstantVector::getSplat(NumElts, C);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -8353,9 +8316,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (isFP) {
unsigned SSECC = 8;
EVT EltVT = Op0.getValueType().getVectorElementType();
- assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
- unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
// SSE Condition code mapping:
@@ -8395,19 +8357,24 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (SSECC == 8) {
if (SetCCOpcode == ISD::SETUEQ) {
SDValue UNORD, EQ;
- UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
- EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+ UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(3, MVT::i8));
+ EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(0, MVT::i8));
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
} else if (SetCCOpcode == ISD::SETONE) {
SDValue ORD, NEQ;
- ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
- NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+ ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(7, MVT::i8));
+ NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(4, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
}
llvm_unreachable("Illegal FP comparison");
}
// Handle all other FP comparisons here.
- return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+ return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, MVT::i8));
}
// Break 256-bit integer vector compare into smaller ones.
@@ -8417,38 +8384,30 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+ unsigned Opc = 0;
bool Swap = false, Invert = false, FlipSigns = false;
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default: break;
- case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
- case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
- case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
- case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
- }
-
switch (SetCCOpcode) {
default: break;
case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = EQOpc; break;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = GTOpc; break;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = GTOpc; Invert = true; break;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break;
case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
}
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX())
+ if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
return SDValue();
- if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX())
+ if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
return SDValue();
// Since SSE has no unsigned integer comparisons, we need to flip the sign
@@ -9113,7 +9072,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
- Subtarget->hasXMM());
+ Subtarget->hasSSE1());
}
// Insert VAARG_64 node into the DAG
@@ -9159,6 +9118,43 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes the immediate version of the shift
+// opcode as input.
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue SrcOp, SDValue ShAmt,
+ SelectionDAG &DAG) {
+ assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+
+ if (isa<ConstantSDNode>(ShAmt)) {
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ }
+ }
+
+ // Change opcode to non-immediate version
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+ case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+ case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+ }
+
+  // Need to build a vector containing the shift amount.
+  // The shift amount is 32 bits, but the SSE instructions read 64 bits,
+  // so fill the upper 32 bits with zero.
+ SDValue ShOps[4];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, MVT::i32);
+ ShOps[2] = DAG.getUNDEF(MVT::i32);
+ ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
@@ -9193,7 +9189,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
unsigned Opc = 0;
ISD::CondCode CC = ISD::SETCC_INVALID;
switch (IntNo) {
- default: break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_comieq_ss:
case Intrinsic::x86_sse2_comieq_sd:
Opc = X86ISD::COMI;
@@ -9265,7 +9261,201 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ // XOP comparison intrinsics
+ case Intrinsic::x86_xop_vpcomltb:
+ case Intrinsic::x86_xop_vpcomltw:
+ case Intrinsic::x86_xop_vpcomltd:
+ case Intrinsic::x86_xop_vpcomltq:
+ case Intrinsic::x86_xop_vpcomltub:
+ case Intrinsic::x86_xop_vpcomltuw:
+ case Intrinsic::x86_xop_vpcomltud:
+ case Intrinsic::x86_xop_vpcomltuq:
+ case Intrinsic::x86_xop_vpcomleb:
+ case Intrinsic::x86_xop_vpcomlew:
+ case Intrinsic::x86_xop_vpcomled:
+ case Intrinsic::x86_xop_vpcomleq:
+ case Intrinsic::x86_xop_vpcomleub:
+ case Intrinsic::x86_xop_vpcomleuw:
+ case Intrinsic::x86_xop_vpcomleud:
+ case Intrinsic::x86_xop_vpcomleuq:
+ case Intrinsic::x86_xop_vpcomgtb:
+ case Intrinsic::x86_xop_vpcomgtw:
+ case Intrinsic::x86_xop_vpcomgtd:
+ case Intrinsic::x86_xop_vpcomgtq:
+ case Intrinsic::x86_xop_vpcomgtub:
+ case Intrinsic::x86_xop_vpcomgtuw:
+ case Intrinsic::x86_xop_vpcomgtud:
+ case Intrinsic::x86_xop_vpcomgtuq:
+ case Intrinsic::x86_xop_vpcomgeb:
+ case Intrinsic::x86_xop_vpcomgew:
+ case Intrinsic::x86_xop_vpcomged:
+ case Intrinsic::x86_xop_vpcomgeq:
+ case Intrinsic::x86_xop_vpcomgeub:
+ case Intrinsic::x86_xop_vpcomgeuw:
+ case Intrinsic::x86_xop_vpcomgeud:
+ case Intrinsic::x86_xop_vpcomgeuq:
+ case Intrinsic::x86_xop_vpcomeqb:
+ case Intrinsic::x86_xop_vpcomeqw:
+ case Intrinsic::x86_xop_vpcomeqd:
+ case Intrinsic::x86_xop_vpcomeqq:
+ case Intrinsic::x86_xop_vpcomequb:
+ case Intrinsic::x86_xop_vpcomequw:
+ case Intrinsic::x86_xop_vpcomequd:
+ case Intrinsic::x86_xop_vpcomequq:
+ case Intrinsic::x86_xop_vpcomneb:
+ case Intrinsic::x86_xop_vpcomnew:
+ case Intrinsic::x86_xop_vpcomned:
+ case Intrinsic::x86_xop_vpcomneq:
+ case Intrinsic::x86_xop_vpcomneub:
+ case Intrinsic::x86_xop_vpcomneuw:
+ case Intrinsic::x86_xop_vpcomneud:
+ case Intrinsic::x86_xop_vpcomneuq:
+ case Intrinsic::x86_xop_vpcomfalseb:
+ case Intrinsic::x86_xop_vpcomfalsew:
+ case Intrinsic::x86_xop_vpcomfalsed:
+ case Intrinsic::x86_xop_vpcomfalseq:
+ case Intrinsic::x86_xop_vpcomfalseub:
+ case Intrinsic::x86_xop_vpcomfalseuw:
+ case Intrinsic::x86_xop_vpcomfalseud:
+ case Intrinsic::x86_xop_vpcomfalseuq:
+ case Intrinsic::x86_xop_vpcomtrueb:
+ case Intrinsic::x86_xop_vpcomtruew:
+ case Intrinsic::x86_xop_vpcomtrued:
+ case Intrinsic::x86_xop_vpcomtrueq:
+ case Intrinsic::x86_xop_vpcomtrueub:
+ case Intrinsic::x86_xop_vpcomtrueuw:
+ case Intrinsic::x86_xop_vpcomtrueud:
+ case Intrinsic::x86_xop_vpcomtrueuq: {
+ unsigned CC = 0;
+ unsigned Opc = 0;
+
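+    // The immediate below encodes the VPCOM/VPCOMU condition code:
+    // 0=LT, 1=LE, 2=GT, 3=GE, 4=EQ, 5=NE, 6=FALSE, 7=TRUE.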
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_xop_vpcomltb:
+ case Intrinsic::x86_xop_vpcomltw:
+ case Intrinsic::x86_xop_vpcomltd:
+ case Intrinsic::x86_xop_vpcomltq:
+ CC = 0;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomltub:
+ case Intrinsic::x86_xop_vpcomltuw:
+ case Intrinsic::x86_xop_vpcomltud:
+ case Intrinsic::x86_xop_vpcomltuq:
+ CC = 0;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomleb:
+ case Intrinsic::x86_xop_vpcomlew:
+ case Intrinsic::x86_xop_vpcomled:
+ case Intrinsic::x86_xop_vpcomleq:
+ CC = 1;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomleub:
+ case Intrinsic::x86_xop_vpcomleuw:
+ case Intrinsic::x86_xop_vpcomleud:
+ case Intrinsic::x86_xop_vpcomleuq:
+ CC = 1;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomgtb:
+ case Intrinsic::x86_xop_vpcomgtw:
+ case Intrinsic::x86_xop_vpcomgtd:
+ case Intrinsic::x86_xop_vpcomgtq:
+ CC = 2;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomgtub:
+ case Intrinsic::x86_xop_vpcomgtuw:
+ case Intrinsic::x86_xop_vpcomgtud:
+ case Intrinsic::x86_xop_vpcomgtuq:
+ CC = 2;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomgeb:
+ case Intrinsic::x86_xop_vpcomgew:
+ case Intrinsic::x86_xop_vpcomged:
+ case Intrinsic::x86_xop_vpcomgeq:
+ CC = 3;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomgeub:
+ case Intrinsic::x86_xop_vpcomgeuw:
+ case Intrinsic::x86_xop_vpcomgeud:
+ case Intrinsic::x86_xop_vpcomgeuq:
+ CC = 3;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomeqb:
+ case Intrinsic::x86_xop_vpcomeqw:
+ case Intrinsic::x86_xop_vpcomeqd:
+ case Intrinsic::x86_xop_vpcomeqq:
+ CC = 4;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomequb:
+ case Intrinsic::x86_xop_vpcomequw:
+ case Intrinsic::x86_xop_vpcomequd:
+ case Intrinsic::x86_xop_vpcomequq:
+ CC = 4;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomneb:
+ case Intrinsic::x86_xop_vpcomnew:
+ case Intrinsic::x86_xop_vpcomned:
+ case Intrinsic::x86_xop_vpcomneq:
+ CC = 5;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomneub:
+ case Intrinsic::x86_xop_vpcomneuw:
+ case Intrinsic::x86_xop_vpcomneud:
+ case Intrinsic::x86_xop_vpcomneuq:
+ CC = 5;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomfalseb:
+ case Intrinsic::x86_xop_vpcomfalsew:
+ case Intrinsic::x86_xop_vpcomfalsed:
+ case Intrinsic::x86_xop_vpcomfalseq:
+ CC = 6;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomfalseub:
+ case Intrinsic::x86_xop_vpcomfalseuw:
+ case Intrinsic::x86_xop_vpcomfalseud:
+ case Intrinsic::x86_xop_vpcomfalseuq:
+ CC = 6;
+ Opc = X86ISD::VPCOMU;
+ break;
+ case Intrinsic::x86_xop_vpcomtrueb:
+ case Intrinsic::x86_xop_vpcomtruew:
+ case Intrinsic::x86_xop_vpcomtrued:
+ case Intrinsic::x86_xop_vpcomtrueq:
+ CC = 7;
+ Opc = X86ISD::VPCOM;
+ break;
+ case Intrinsic::x86_xop_vpcomtrueub:
+ case Intrinsic::x86_xop_vpcomtrueuw:
+ case Intrinsic::x86_xop_vpcomtrueud:
+ case Intrinsic::x86_xop_vpcomtrueuq:
+ CC = 7;
+ Opc = X86ISD::VPCOMU;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
+ DAG.getConstant(CC, MVT::i8));
+ }
+
// Arithmetic intrinsics.
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
case Intrinsic::x86_avx_hadd_ps_256:
@@ -9278,6 +9468,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx_hsub_pd_256:
return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_ssse3_phadd_w_128:
+ case Intrinsic::x86_ssse3_phadd_d_128:
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phadd_d:
+ return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_ssse3_phsub_w_128:
+ case Intrinsic::x86_ssse3_phsub_d_128:
+ case Intrinsic::x86_avx2_phsub_w:
+ case Intrinsic::x86_avx2_phsub_d:
+ return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_d_256:
@@ -9294,6 +9496,33 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx2_psrav_d_256:
return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_ssse3_psign_b_128:
+ case Intrinsic::x86_ssse3_psign_w_128:
+ case Intrinsic::x86_ssse3_psign_d_128:
+ case Intrinsic::x86_avx2_psign_b:
+ case Intrinsic::x86_avx2_psign_w:
+ case Intrinsic::x86_avx2_psign_d:
+ return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_insertps:
+ return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::x86_avx_vperm2f128_ps_256:
+ case Intrinsic::x86_avx_vperm2f128_pd_256:
+ case Intrinsic::x86_avx_vperm2f128_si_256:
+ case Intrinsic::x86_avx2_vperm2i128:
+ return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::x86_avx_vpermil_ps:
+ case Intrinsic::x86_avx_vpermil_pd:
+ case Intrinsic::x86_avx_vpermil_ps_256:
+ case Intrinsic::x86_avx_vpermil_pd_256:
+ return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
@@ -9361,24 +9590,53 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // Fix vector shift instructions where the last operand is a non-immediate
- // i32 value.
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
+ // SSE/AVX shift intrinsics
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
+ // Fix vector shift instructions where the last operand is a non-immediate
+ // i32 value.
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
@@ -9392,103 +9650,40 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
return SDValue();
unsigned NewIntNo = 0;
- EVT ShAmtVT = MVT::v4i32;
switch (IntNo) {
- case Intrinsic::x86_sse2_pslli_w:
- NewIntNo = Intrinsic::x86_sse2_psll_w;
- break;
- case Intrinsic::x86_sse2_pslli_d:
- NewIntNo = Intrinsic::x86_sse2_psll_d;
- break;
- case Intrinsic::x86_sse2_pslli_q:
- NewIntNo = Intrinsic::x86_sse2_psll_q;
- break;
- case Intrinsic::x86_sse2_psrli_w:
- NewIntNo = Intrinsic::x86_sse2_psrl_w;
- break;
- case Intrinsic::x86_sse2_psrli_d:
- NewIntNo = Intrinsic::x86_sse2_psrl_d;
- break;
- case Intrinsic::x86_sse2_psrli_q:
- NewIntNo = Intrinsic::x86_sse2_psrl_q;
- break;
- case Intrinsic::x86_sse2_psrai_w:
- NewIntNo = Intrinsic::x86_sse2_psra_w;
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntNo = Intrinsic::x86_mmx_psll_w;
break;
- case Intrinsic::x86_sse2_psrai_d:
- NewIntNo = Intrinsic::x86_sse2_psra_d;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntNo = Intrinsic::x86_mmx_psll_d;
break;
- case Intrinsic::x86_avx2_pslli_w:
- NewIntNo = Intrinsic::x86_avx2_psll_w;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntNo = Intrinsic::x86_mmx_psll_q;
break;
- case Intrinsic::x86_avx2_pslli_d:
- NewIntNo = Intrinsic::x86_avx2_psll_d;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntNo = Intrinsic::x86_mmx_psrl_w;
break;
- case Intrinsic::x86_avx2_pslli_q:
- NewIntNo = Intrinsic::x86_avx2_psll_q;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntNo = Intrinsic::x86_mmx_psrl_d;
break;
- case Intrinsic::x86_avx2_psrli_w:
- NewIntNo = Intrinsic::x86_avx2_psrl_w;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntNo = Intrinsic::x86_mmx_psrl_q;
break;
- case Intrinsic::x86_avx2_psrli_d:
- NewIntNo = Intrinsic::x86_avx2_psrl_d;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntNo = Intrinsic::x86_mmx_psra_w;
break;
- case Intrinsic::x86_avx2_psrli_q:
- NewIntNo = Intrinsic::x86_avx2_psrl_q;
- break;
- case Intrinsic::x86_avx2_psrai_w:
- NewIntNo = Intrinsic::x86_avx2_psra_w;
- break;
- case Intrinsic::x86_avx2_psrai_d:
- NewIntNo = Intrinsic::x86_avx2_psra_d;
- break;
- default: {
- ShAmtVT = MVT::v2i32;
- switch (IntNo) {
- case Intrinsic::x86_mmx_pslli_w:
- NewIntNo = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntNo = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntNo = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntNo = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntNo = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntNo = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntNo = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntNo = Intrinsic::x86_mmx_psra_d;
- break;
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- }
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntNo = Intrinsic::x86_mmx_psra_d;
break;
- }
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
    // The vector shift intrinsics with scalars use 32-bit shift amounts but
    // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits
// to be zero.
- SDValue ShOps[4];
- ShOps[0] = ShAmt;
- ShOps[1] = DAG.getConstant(0, MVT::i32);
- if (ShAmtVT == MVT::v4i32) {
- ShOps[2] = DAG.getUNDEF(MVT::i32);
- ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
- } else {
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
+ DAG.getConstant(0, MVT::i32));
      // FIXME: This must be lowered to get rid of the invalid type.
- }
EVT VT = Op.getValueType();
ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
@@ -9828,7 +10023,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
+ SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -9836,26 +10032,41 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
Op = Op.getOperand(0);
if (VT == MVT::i8) {
+    // Zero extend to i32 since there is no i8 bsr instruction.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
- // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ // Issue a bsr (scan bits in reverse).
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // And xor with NumBits-1.
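+  // BSR returns the index of the highest set bit, and for a power-of-two
+  // width (NumBits-1) - Index == (NumBits-1) ^ Index. E.g. for i32 input
+  // 0x8000, BSR gives 15 and 31 ^ 15 == 16 == ctlz.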
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ Op = Op.getOperand(0);
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
- DAG.getConstant(NumBits, OpVT),
+ DAG.getConstant(NumBits, VT),
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
-
- if (VT == MVT::i8)
- Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
- return Op;
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
}
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
@@ -9910,86 +10121,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
return Lower256IntArith(Op, DAG);
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ "Only know how to lower V2I64/V4I64 multiply");
+
DebugLoc dl = Op.getDebugLoc();
+ // Ahi = psrlqi(a, 32);
+ // Bhi = psrlqi(b, 32);
+ //
+ // AloBlo = pmuludq(a, b);
+ // AloBhi = pmuludq(a, Bhi);
+ // AhiBlo = pmuludq(Ahi, b);
+
+ // AloBhi = psllqi(AloBhi, 32);
+ // AhiBlo = psllqi(AhiBlo, 32);
+ // return AloBlo + AloBhi + AhiBlo;
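+  //
+  // This yields a*b modulo 2^64 from the 32-bit halves, since
+  //   a*b == alo*blo + ((alo*bhi + ahi*blo) << 32)   (mod 2^64)
+  // and the ahi*bhi term only contributes at bit 64 and above.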
+
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
- if (VT == MVT::v4i64) {
- assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
+ SDValue ShAmt = DAG.getConstant(32, MVT::i32);
- // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
- // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
- // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
- // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
- // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
- //
- // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
- // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
- // return AloBlo + AloBhi + AhiBlo;
-
- SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- A, DAG.getConstant(32, MVT::i32));
- SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- B, DAG.getConstant(32, MVT::i32));
- SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
- A, B);
- SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
- A, Bhi);
- SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
- Ahi, B);
- AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- AloBhi, DAG.getConstant(32, MVT::i32));
- AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- AhiBlo, DAG.getConstant(32, MVT::i32));
- SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
- Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
- return Res;
- }
+ SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
+ SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
- assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+  // Bitcast to 32-bit vectors for PMULUDQ.
+ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+ A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+ B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+ Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+ Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
- // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
- // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
- // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
- // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
- // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
- //
- // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
- // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
- // return AloBlo + AloBhi + AhiBlo;
+ SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+ SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+ SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+ AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
+ AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
- SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- A, DAG.getConstant(32, MVT::i32));
- SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- B, DAG.getConstant(32, MVT::i32));
- SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
- A, B);
- SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
- A, Bhi);
- SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
- Ahi, B);
- AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- AloBhi, DAG.getConstant(32, MVT::i32));
- AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- AhiBlo, DAG.getConstant(32, MVT::i32));
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
- Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
- return Res;
+ return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
@@ -10000,7 +10171,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
SDValue Amt = Op.getOperand(1);
LLVMContext *Context = DAG.getContext();
- if (!Subtarget->hasXMMInt())
+ if (!Subtarget->hasSSE2())
return SDValue();
// Optimize shl/srl/sra with constant shift amount.
@@ -10009,119 +10180,93 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
uint64_t ShiftAmt = C->getZExtValue();
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
- }
-
- if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+ (Subtarget->hasAVX2() &&
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ if (Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ if (Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
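+        // Note: SSE2/AVX2 provide no 64-bit arithmetic shift instruction,
+        // so v2i64/v4i64 SRA is excluded below.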
+ if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+ return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
}
- if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ if (VT == MVT::v16i8) {
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
}
+ if (Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
+ // R s>> a === ((R u>> a) ^ m) - m
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
}
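
A scalar model of the two byte-wise SRA tricks used above; an illustrative sketch (helper names invented here), not code from the patch. With m = 128 >> a, the mask bit sits exactly where the sign bit lands after a logical shift, so "((R u>> a) ^ m) - m" re-extends the sign; for a == 7 the result is all-ones exactly when the byte is negative, which is pcmpgt(0, R).

#include <cassert>
#include <cstdint>

// Emulate "R s>> a" for bytes using only a logical shift, xor and sub.
static int8_t sra8(int8_t R, unsigned a) {
  uint8_t m = uint8_t(128 >> a);         // sign bit after the logical shift
  uint8_t u = uint8_t(uint8_t(R) >> a);  // R u>> a
  return int8_t(uint8_t((u ^ m) - m));   // ((R u>> a) ^ m) - m
}

int main() {
  for (int r = -128; r != 128; ++r)
    for (unsigned a = 0; a != 8; ++a)
      assert(sra8(int8_t(r), a) == int8_t(r >> a)); // matches arithmetic >>
  return 0;
}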
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
// R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// R s>> a === ((R u>> a) ^ m) - m
@@ -10139,14 +10284,11 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- Op.getOperand(1), DAG.getConstant(23, MVT::i32));
+ Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
+ DAG.getConstant(23, MVT::i32));
- ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
-
- std::vector<Constant*> CV(4, CI);
- Constant *C = ConstantVector::get(CV);
+ const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
+ Constant *C = ConstantDataVector::get(*Context, CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -10158,55 +10300,54 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
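
A scalar sketch of the constant-pool trick above (illustrative, not from the patch): adding the shift amounts, pre-shifted to bit 23, to 0x3f800000, the IEEE-754 encoding of 1.0f, manufactures the bit pattern of 2^amt in each lane; converting back to integer (the step elided by the hunk header above) yields the multiplier for "x << amt == x * 2^amt".

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t Pow2ViaFloat(uint32_t Amt) {
  uint32_t Bits = (Amt << 23) + 0x3f800000U; // exponent field holds Amt
  float F;
  std::memcpy(&F, &Bits, sizeof(F));         // the BITCAST to v4f32
  return uint32_t(F);                        // cvttps2dq in the lowering
}

int main() {
  for (uint32_t Amt = 0; Amt != 31; ++Amt)
    assert(Pow2ViaFloat(Amt) == (1U << Amt));
  return 0;
}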
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+ assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
+
// a = a << 5;
- Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- Op.getOperand(1), DAG.getConstant(5, MVT::i32));
+ Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
+ DAG.getConstant(5, MVT::i32));
+ Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
- ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
- ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
+ // Turn 'a' into a mask suitable for VSELECT
+ SDValue VSelM = DAG.getConstant(0x80, VT);
+ SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+ SDValue CM1 = DAG.getConstant(0x0f, VT);
+ SDValue CM2 = DAG.getConstant(0x3f, VT);
+
+ // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+ SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(4, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
- std::vector<Constant*> CVM1(16, CM1);
- std::vector<Constant*> CVM2(16, CM2);
- Constant *C = ConstantVector::get(CVM1);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
- SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
-
- // r = pblendv(r, psllw(r & (char16)15, 4), a);
- M = DAG.getNode(ISD::AND, dl, VT, R, M);
- M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
- DAG.getConstant(4, MVT::i32));
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+ // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+ M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(2, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
- C = ConstantVector::get(CVM2);
- CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
- M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
-
- // r = pblendv(r, psllw(r & (char16)63, 2), a);
- M = DAG.getNode(ISD::AND, dl, VT, R, M);
- M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
- DAG.getConstant(2, MVT::i32));
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op, M, R);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+ OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
- // return pblendv(r, r+r, a);
- R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
+ // return VSELECT(r, r+r, a);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
DAG.getNode(ISD::ADD, dl, VT, R, R), R);
return R;
}
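
A scalar model of the variable-shift sequence above (illustrative, not from the patch). "a << 5" parks the three useful shift bits at the top of each byte; each VSELECT step applies a shift of 4, 2, then 1 where the current top bit is set, and "a += a" exposes the next bit. The 0x0f and 0x3f masks mirror the byte-lane masking the vector code needs because psllw shifts 16-bit lanes.

#include <cassert>
#include <cstdint>

static uint8_t Shl8Var(uint8_t R, uint8_t Amt) {
  uint8_t A = uint8_t(Amt << 5);               // shift bits now at the top
  if (A & 0x80) R = uint8_t((R & 0x0f) << 4);  // step 1: shift by 4
  A = uint8_t(A + A);
  if (A & 0x80) R = uint8_t((R & 0x3f) << 2);  // step 2: shift by 2
  A = uint8_t(A + A);
  if (A & 0x80) R = uint8_t(R + R);            // step 3: shift by 1
  return R;
}

int main() {
  for (unsigned R = 0; R != 256; ++R)
    for (uint8_t Amt = 0; Amt != 8; ++Amt)
      assert(Shl8Var(uint8_t(R), Amt) == uint8_t(R << Amt));
  return 0;
}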
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.getSizeInBits() == 256) {
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10221,9 +10362,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// Constant shift amount
SmallVector<SDValue, 4> Amt1Csts;
SmallVector<SDValue, 4> Amt2Csts;
- for (int i = 0; i < NumElems/2; ++i)
+ for (unsigned i = 0; i != NumElems/2; ++i)
Amt1Csts.push_back(Amt->getOperand(i));
- for (int i = NumElems/2; i < NumElems; ++i)
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
@@ -10323,77 +10464,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
EVT VT = Op.getValueType();
- if (Subtarget->hasXMMInt() && VT.isVector()) {
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
- SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+ if (!Subtarget->hasSSE2() || !VT.isVector())
+ return SDValue();
- unsigned SHLIntrinsicsID = 0;
- unsigned SRAIntrinsicsID = 0;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
+ unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+ ExtraVT.getScalarType().getSizeInBits();
+ SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v8i32:
+ case MVT::v16i16:
+ if (!Subtarget->hasAVX())
return SDValue();
- case MVT::v4i32:
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
- SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
- break;
- case MVT::v8i16:
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
- SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
- break;
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasAVX())
- return SDValue();
- if (!Subtarget->hasAVX2()) {
- // needs to be split
- int NumElems = VT.getVectorNumElements();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- int ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
- }
- if (VT == MVT::v8i32) {
- SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d;
- SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d;
- } else {
- SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w;
- SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w;
- }
+ if (!Subtarget->hasAVX2()) {
+      // Needs to be split into two 128-bit halves.
+ int NumElems = VT.getVectorNumElements();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ int ExtraNumElems = ExtraVT.getVectorNumElements();
+ ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+ ExtraNumElems/2);
+ SDValue Extra = DAG.getValueType(ExtraVT);
+
+ LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+ LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+ }
+ // fall through
+ case MVT::v4i32:
+ case MVT::v8i16: {
+ SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
+ Op.getOperand(0), ShAmt, DAG);
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
}
-
- SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SHLIntrinsicsID, MVT::i32),
- Op.getOperand(0), ShAmt);
-
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SRAIntrinsicsID, MVT::i32),
- Tmp1, ShAmt);
}
-
- return SDValue();
}
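
The function above reduces SIGN_EXTEND_INREG to a shift pair; a scalar sketch of the identity for i8-in-i32 (illustrative, not from the patch):

#include <cassert>
#include <cstdint>

static int32_t SextInRegI8(int32_t X) {
  const unsigned BitsDiff = 32 - 8;                     // VT - ExtraVT bits
  return int32_t(uint32_t(X) << BitsDiff) >> BitsDiff;  // VSHLI then VSRAI
}

int main() {
  assert(SextInRegI8(0x0000007f) == 127);
  assert(SextInRegI8(0x00000080) == -128);  // bit 7 becomes the sign
  assert(SextInRegI8(0x1234ff85) == -123);  // upper garbage discarded
  return 0;
}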
@@ -10402,7 +10524,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
// There isn't any reason to disable it if the target processor supports it.
- if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
+ if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
SDValue Chain = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = {
@@ -10456,7 +10578,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
// no-sse2). There isn't any reason to disable it if the target processor
// supports it.
- if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+ if (Subtarget->hasSSE2() || Subtarget->is64Bit())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
@@ -10487,8 +10609,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
unsigned Reg = 0;
unsigned size = 0;
switch(T.getSimpleVT().SimpleTy) {
- default:
- assert(false && "Invalid value type!");
+ default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
@@ -10536,7 +10657,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
- assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
+ assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -10606,7 +10727,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
unsigned Opc;
bool ExtraOp = false;
switch (Op.getOpcode()) {
- default: assert(0 && "Invalid code");
+ default: llvm_unreachable("Invalid code");
case ISD::ADDC: Opc = X86ISD::ADD; break;
case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
case ISD::SUBC: Opc = X86ISD::SUB; break;
@@ -10673,6 +10794,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SRA:
@@ -10747,8 +10869,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DebugLoc dl = N->getDebugLoc();
switch (N->getOpcode()) {
default:
- assert(false && "Do not know how to custom type legalize this operation!");
- return;
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
@@ -10756,16 +10877,25 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SUBE:
// We don't want to expand or promote these.
return;
- case ISD::FP_TO_SINT: {
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
+ return;
+
std::pair<SDValue,SDValue> Vals =
- FP_TO_INTHelper(SDValue(N, 0), DAG, true);
+ FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
if (FIST.getNode() != 0) {
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
- MachinePointerInfo(),
- false, false, false, 0));
+ if (StackSlot.getNode() != 0)
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0));
+ else
+ Results.push_back(FIST);
}
return;
}
@@ -10924,18 +11054,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+ case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
- case X86ISD::CMPPD: return "X86ISD::CMPPD";
- case X86ISD::CMPPS: return "X86ISD::CMPPS";
- case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
- case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
- case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
- case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
- case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
- case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
- case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
- case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+ case X86ISD::VSRA: return "X86ISD::VSRA";
+ case X86ISD::VSHLI: return "X86ISD::VSHLI";
+ case X86ISD::VSRLI: return "X86ISD::VSRLI";
+ case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+ case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
@@ -10957,22 +11086,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PALIGN: return "X86ISD::PALIGN";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
- case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
- case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD";
- case X86ISD::SHUFPS: return "X86ISD::SHUFPS";
- case X86ISD::SHUFPD: return "X86ISD::SHUFPD";
+ case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD";
case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
- case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD";
- case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
@@ -10980,11 +11103,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
+ case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
+ case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
}
}
@@ -11093,15 +11218,15 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
return (VT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, VT) ||
- isSHUFPMask(M, VT) ||
+ isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
+ isPALIGNRMask(M, VT, Subtarget) ||
isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
- isUNPCKL_v_undef_Mask(M, VT) ||
- isUNPCKH_v_undef_Mask(M, VT));
+ isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
+ isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
}
bool
@@ -11114,8 +11239,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
if (NumElts == 4 && VT.getSizeInBits() == 128) {
return (isMOVLMask(Mask, VT) ||
isCommutedMOVLMask(Mask, VT, true) ||
- isSHUFPMask(Mask, VT) ||
- isSHUFPMask(Mask, VT, /* Commuted */ true));
+ isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
+ isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
}
return false;
}
@@ -11134,7 +11259,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
unsigned CXchgOpc,
unsigned notOpc,
unsigned EAXreg,
- TargetRegisterClass *RC,
+ const TargetRegisterClass *RC,
bool invSrc) const {
// For the atomic bitwise operator, we generate
// thisMBB:
@@ -11506,7 +11631,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
unsigned numArgs, bool memArg) const {
- assert(Subtarget->hasSSE42orAVX() &&
+ assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
DebugLoc dl = MI->getDebugLoc();
@@ -11911,6 +12036,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
return EndMBB;
}
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock* BB,
+ const TargetRegisterInfo* TRI) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+ for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(X86::EFLAGS))
+ return false;
+ if (mi.definesRegister(X86::EFLAGS))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ if (miI == BB->end()) {
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+ }
+
+ // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+ // out. SelectMI should have a kill flag on EFLAGS.
+ SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+ return true;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -11940,7 +12101,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- if (!MI->killsRegister(X86::EFLAGS)) {
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ if (!MI->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
sinkMBB->addLiveIn(X86::EFLAGS);
}
@@ -12038,7 +12201,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
- .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
@@ -12051,17 +12214,23 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
+ const uint32_t *RegMask =
+ getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+ .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+ .addRegMask(RegMask)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space");
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
@@ -12159,6 +12328,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
assert(MI->getOperand(3).isGlobal() && "This should be a global");
+ // Get a register mask for the lowered call.
+ // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+ // proper register mask.
+ const uint32_t *RegMask =
+ getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -12169,6 +12343,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
+ MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV32rm), X86::EAX)
@@ -12179,6 +12354,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV32rm), X86::EAX)
@@ -12189,6 +12365,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
+ MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
MI->eraseFromParent(); // The pseudo instruction is gone now.
@@ -12199,30 +12376,14 @@ MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
- default: assert(0 && "Unexpected instr type to insert");
+ default: llvm_unreachable("Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
- assert(0 && "TAILJMP64 would not be touched here.");
+ llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
- // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset.
- // On AMD64, additional defs should be added before register allocation.
- if (!Subtarget->isTargetWin64()) {
- MI->addRegisterDefined(X86::RSI);
- MI->addRegisterDefined(X86::RDI);
- MI->addRegisterDefined(X86::XMM6);
- MI->addRegisterDefined(X86::XMM7);
- MI->addRegisterDefined(X86::XMM8);
- MI->addRegisterDefined(X86::XMM9);
- MI->addRegisterDefined(X86::XMM10);
- MI->addRegisterDefined(X86::XMM11);
- MI->addRegisterDefined(X86::XMM12);
- MI->addRegisterDefined(X86::XMM13);
- MI->addRegisterDefined(X86::XMM14);
- MI->addRegisterDefined(X86::XMM15);
- }
return BB;
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
@@ -12572,15 +12733,18 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse2_pmovmskb_128: {
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx2_pmovmskb: {
// High bits of movmskp{s|d}, pmovmskb are known zero.
switch (IntId) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
}
KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
Mask.getBitWidth() - NumLoBits);
@@ -12651,7 +12815,8 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget* Subtarget) {
DebugLoc dl = N->getDebugLoc();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
@@ -12687,9 +12852,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
!isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
return SDValue();
+ // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+ return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ }
+
// Emit a zeroed vector and insert the desired subvector on its
// first half.
- SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
DAG.getConstant(0, MVT::i32), DAG, dl);
return DCI.CombineTo(N, InsV);
@@ -12734,7 +12913,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
- return PerformShuffleCombine256(N, DAG, DCI);
+ return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
// Only handle 128 wide vector from here on.
if (VT.getSizeInBits() != 128)
@@ -12750,6 +12929,82 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
+
+/// PerformTruncateCombine - Converts a truncate operation into
+/// a sequence of vector shuffle operations.
+/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
+
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!Subtarget->hasAVX()) return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+ DAG.getIntPtrConstant(0));
+
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+ DAG.getIntPtrConstant(2));
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+ // PSHUFD
+ int ShufMask1[] = {0, 2, 0, 0};
+
+ OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+ ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+ ShufMask1);
+
+ // MOVLHPS
+ int ShufMask2[] = {0, 1, 4, 5};
+
+ return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+ }
+ if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+ DAG.getIntPtrConstant(0));
+
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+ DAG.getIntPtrConstant(4));
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+ // PSHUFB
+ int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+ DAG.getUNDEF(MVT::v16i8),
+ ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+ DAG.getUNDEF(MVT::v16i8),
+ ShufMask1);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+ // MOVLHPS
+ int ShufMask2[] = {0, 1, 4, 5};
+
+ SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+ }
+
+ return SDValue();
+}
+
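A scalar model of the v4i64 -> v4i32 path above (illustrative, not from the patch). Viewed as v4i32, each 128-bit half carries the wanted dwords in its even lanes; PSHUFD {0, 2, 0, 0} compacts them into the low two lanes, and the {0, 1, 4, 5} shuffle glues the two halves.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t In[4] = {0x1111111100000001ULL, 0x2222222200000002ULL,
                    0x3333333300000003ULL, 0x4444444400000004ULL};
  uint32_t Lo[4], Hi[4];                     // the two bitcast halves
  for (int i = 0; i != 2; ++i) {
    Lo[2*i] = uint32_t(In[i]);   Lo[2*i+1] = uint32_t(In[i] >> 32);
    Hi[2*i] = uint32_t(In[i+2]); Hi[2*i+1] = uint32_t(In[i+2] >> 32);
  }
  uint32_t L[4] = {Lo[0], Lo[2], Lo[0], Lo[0]}; // PSHUFD {0, 2, 0, 0}
  uint32_t H[4] = {Hi[0], Hi[2], Hi[0], Hi[0]};
  uint32_t Out[4] = {L[0], L[1], H[0], H[1]};   // shuffle {0, 1, 4, 5}
  for (int i = 0; i != 4; ++i)
    assert(Out[i] == uint32_t(In[i]));          // elementwise truncation
  return 0;
}
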
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
@@ -12836,6 +13091,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
@@ -12850,7 +13106,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// ignored in unsafe-math mode).
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- (Subtarget->hasXMMInt() ||
+ (Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -13081,6 +13337,57 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Canonicalize max and min:
+ // (x > y) ? x : y -> (x >= y) ? x : y
+ // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare against zero. e.g.:
+  // (x - y) > 0 ? (x - y) : 0  ->  (x - y) >= 0 ? (x - y) : 0
+ // subl %esi, %edi
+ // testl %edi, %edi
+ // movl $0, %eax
+ // cmovgl %edi, %eax
+ // =>
+ // xorl %eax, %eax
+ // subl %esi, $edi
+ // cmovsl %eax, %edi
+ if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETLT:
+ case ISD::SETGT: {
+ ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+ Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ }
+ }
+ }
+
+ // If we know that this node is legal then we know that it is going to be
+ // matched by one of the SSE/AVX BLEND instructions. These instructions only
+ // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+ // to simplify previous instructions.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+ !DCI.isBeforeLegalize() &&
+ TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+ assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
return SDValue();
}
@@ -13314,6 +13621,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (N->getOpcode() == ISD::SHL) {
@@ -13325,7 +13633,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
// all elements are shifted by the same amount. We can't do this in legalize
// because the a constant vector is typically transformed to a constant pool
// so we have no knowledge of the shift amount.
- if (!Subtarget->hasXMMInt())
+ if (!Subtarget->hasSSE2())
return SDValue();
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
@@ -13346,6 +13654,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
BaseShAmt = Arg;
break;
}
+ // Handle the case where the build_vector is all undef
+ // FIXME: Should DAG allow this?
+ if (i == NumElts)
+ return SDValue();
+
for (; i != NumElts; ++i) {
SDValue Arg = ShAmtOp.getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
@@ -13372,9 +13685,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
BaseShAmt = InVec.getOperand(1);
}
}
- if (BaseShAmt.getNode() == 0)
+ if (BaseShAmt.getNode() == 0) {
+ // Don't create instructions with illegal types after legalize
+ // types has run.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
+ !DCI.isBeforeLegalize())
+ return SDValue();
+
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
DAG.getIntPtrConstant(0));
+ }
} else
return SDValue();
@@ -13389,79 +13709,38 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
switch (N->getOpcode()) {
default:
llvm_unreachable("Unknown shift opcode!");
- break;
case ISD::SHL:
- if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
case ISD::SRA:
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
case ISD::SRL:
- if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
}
- return SDValue();
}
@@ -13475,7 +13754,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
@@ -13666,14 +13945,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// look for psign/blend
if (VT == MVT::v2i64 || VT == MVT::v4i64) {
- if (!Subtarget->hasSSSE3orAVX() ||
+ if (!Subtarget->hasSSSE3() ||
(VT == MVT::v4i64 && !Subtarget->hasAVX2()))
return SDValue();
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::ANDNP)
std::swap(N0, N1);
- // or (and (m, x), (pandn m, y))
+ // or (and (m, y), (pandn m, x))
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
@@ -13697,24 +13976,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
Mask = Mask.getOperand(0);
EVT MaskVT = Mask.getValueType();
- // Validate that the Mask operand is a vector sra node. The sra node
- // will be an intrinsic.
- if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
- return SDValue();
-
+ // Validate that the Mask operand is a vector sra node.
// FIXME: what to do for bytes, since there is a psignb/pblendvb, but
// there is no psrai.b
- switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- break;
- default: return SDValue();
- }
+ if (Mask.getOpcode() != X86ISD::VSRAI)
+ return SDValue();
// Check that the SRA is all signbits.
- SDValue SraC = Mask.getOperand(2);
+ SDValue SraC = Mask.getOperand(1);
unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
if ((SraAmt + 1) != EltBits)
@@ -13729,14 +13998,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
Y = Y.getOperand(0);
if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
- X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() &&
- (EltBits == 8 || EltBits == 16 || EltBits == 32)) {
- SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X,
- Mask.getOperand(1));
- return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
+ X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Unsupported VT for PSIGN");
+ Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
}
// PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41orAVX())
+ if (!Subtarget->hasSSE41())
return SDValue();
EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
@@ -13807,6 +14076,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
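
The PSIGN/BLEND rewrites above hinge on the mask being "all sign bits": after an arithmetic shift by EltBits-1 every element is 0 or -1, so the and/pandn/por triple is a per-element select. A scalar model of that identity (illustrative, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  int32_t Xs[] = {7, -3, 0, 42}, Ss[] = {-1, 5, -9, 0};
  for (int i = 0; i != 4; ++i) {
    int32_t X = Xs[i], S = Ss[i];
    int32_t M = S >> 31;              // VSRAI by EltBits-1: 0 or -1
    int32_t Y = 0 - X;                // the matched "sub 0, x" operand
    int32_t Sel = (M & Y) | (~M & X); // and / pandn / por
    assert(Sel == (S < 0 ? -X : X));  // a per-element select on sign(S)
  }
  return 0;
}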
+// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -13818,6 +14088,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
+ assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
+
// Create BLSMSK instructions by finding X ^ (X-1)
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
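
BLSMSK computes x ^ (x - 1): subtracting one flips the lowest set bit and every bit below it, so the xor yields a mask of the bits up to and including that bit. A quick illustration (not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x48;                // lowest set bit is bit 3
  assert((X ^ (X - 1)) == 0x0f);    // bits 0..3 set
  assert((0x80000000U ^ 0x7fffffffU) == 0xffffffffU); // X = 1u << 31
  return 0;
}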
@@ -13849,7 +14121,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// shuffle. We need SSE4 for the shuffles.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
- if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ if (RegVT.isVector() && RegVT.isInteger() &&
+ Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
@@ -13896,7 +14169,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
+ ScalarInVector);
unsigned SizeRatio = RegSz/MemSz;
// Redistribute the loaded elements into the different locations.
@@ -14039,7 +14313,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
- && Subtarget->hasXMMInt();
+ && Subtarget->hasSSE2();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
@@ -14057,7 +14331,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
Ld = cast<LoadSDNode>(St->getChain());
else if (St->getValue().hasOneUse() &&
ChainVal->getOpcode() == ISD::TokenFactor) {
- for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
if (ChainVal->getOperand(i).getNode() == LdVal) {
TokenFactorIndex = i;
Ld = cast<LoadSDNode>(St->getValue());
@@ -14196,7 +14470,8 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
A = LHS.getOperand(0);
if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
B = LHS.getOperand(1);
- cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), LMask.begin());
} else {
if (LHS.getOpcode() != ISD::UNDEF)
A = LHS;
@@ -14213,7 +14488,8 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
C = RHS.getOperand(0);
if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
D = RHS.getOperand(1);
- cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), RMask.begin());
} else {
if (RHS.getOpcode() != ISD::UNDEF)
C = RHS;
@@ -14270,7 +14546,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
SDValue RHS = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, true))
return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
@@ -14285,7 +14561,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
SDValue RHS = N->getOperand(1);
// Try to synthesize horizontal subs from subs of shuffles.
- if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, false))
return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
@@ -14352,7 +14628,58 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+  // Optimize vectors in AVX mode:
+  // sign extend v8i16 to v8i32 and v4i32 to v4i64.
+  //
+  // Divide the input vector into two parts: for v4i32 the shuffle masks
+  // will be {0, 1, -1, -1} and {2, 3, -1, -1}. Use the vpmovsx instruction
+  // to extend each part (v4i32 -> v2i64, v8i16 -> v4i32), then concat the
+  // two results back to the original VT.
+
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ DebugLoc dl = N->getDebugLoc();
+
+ if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
+ (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+
+ unsigned NumElems = OpVT.getVectorNumElements();
+ SmallVector<int,8> ShufMask1(NumElems, -1);
+ for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+
+ SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+ ShufMask1.data());
+
+ SmallVector<int,8> ShufMask2(NumElems, -1);
+ for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+
+ SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+ ShufMask2.data());
+
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ }
+ return SDValue();
+}
+
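A scalar model of the split sign-extension above (illustrative, not from the patch): the two shuffles park each half of the source in the low lanes (-1 lanes are don't-care), vpmovsx widens those low lanes, and the concat reassembles the full result.

#include <cassert>
#include <cstdint>

int main() {
  int32_t In[4] = {-2, 3, -500000, 7};       // a v4i32 source
  int32_t LoHalf[2] = {In[0], In[1]};        // shuffle {0, 1, -1, -1}
  int32_t HiHalf[2] = {In[2], In[3]};        // shuffle {2, 3, -1, -1}
  // vpmovsxdq on each half (v4i32 -> v2i64), then CONCAT_VECTORS.
  int64_t Out[4] = {LoHalf[0], LoHalf[1], HiHalf[0], HiHalf[1]};
  for (int i = 0; i != 4; ++i)
    assert(Out[i] == int64_t(In[i]));        // elementwise sign extension
  return 0;
}
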
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
@@ -14360,6 +14687,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
DebugLoc dl = N->getDebugLoc();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
@@ -14374,6 +14703,37 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+ if (Subtarget->hasAVX()) {
+
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
+
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+      SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0,
+                                          ZeroVec, DAG);
+      SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0,
+                                          ZeroVec, DAG);
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ }
+ }
+
return SDValue();
}
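
Unpacking against a zero vector is zero extension: punpcklwd interleaves a zero word above each low source word, and each pair read as an i32 is the zext of the word. A scalar model (illustrative, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t X[2] = {0xbeef, 0x0001};
  uint16_t Z[2] = {0, 0};                       // the zero vector
  uint16_t Unpck[4] = {X[0], Z[0], X[1], Z[1]}; // punpcklwd(X, Z)
  uint32_t Wide[2];
  for (int i = 0; i != 2; ++i)                  // bitcast word pairs to i32
    Wide[i] = uint32_t(Unpck[2*i]) | (uint32_t(Unpck[2*i+1]) << 16);
  assert(Wide[0] == 0x0000beefU && Wide[1] == 0x00000001U);
  return 0;
}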
@@ -14490,8 +14850,8 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
- if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) &&
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
@@ -14523,7 +14883,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
// Try to synthesize horizontal adds from adds of shuffles.
EVT VT = N->getValueType(0);
- if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
@@ -14539,7 +14899,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
case ISD::VSELECT:
- case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
@@ -14547,7 +14907,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
case ISD::SRA:
- case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
+ case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
@@ -14561,10 +14921,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG);
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget);
+ case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
- case X86ISD::SHUFPS: // Handle all target specific shuffles
- case X86ISD::SHUFPD:
+ case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
@@ -14684,11 +15045,38 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
+namespace {
+  // Helper to match an asm string against pieces separated by whitespace.
+ bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
+ s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+ for (unsigned i = 0, e = args.size(); i != e; ++i) {
+ StringRef piece(*args[i]);
+ if (!s.startswith(piece)) // Check if the piece matches.
+ return false;
+
+ s = s.substr(piece.size());
+ StringRef::size_type pos = s.find_first_not_of(" \t");
+ if (pos == 0) // We matched a prefix.
+ return false;
+
+ s = s.substr(pos);
+ }
+
+ return s.empty();
+ }
+ const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+}
+
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
std::string AsmStr = IA->getAsmString();
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
@@ -14696,35 +15084,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
switch (AsmPieces.size()) {
default: return false;
case 1:
- AsmStr = AsmPieces[0];
- AsmPieces.clear();
- SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
-
// FIXME: this should verify that we are targeting a 486 or better. If not,
- // we will turn this bswap into something that will be lowered to logical ops
- // instead of emitting the bswap asm. For now, we don't support 486 or lower
- // so don't worry about this.
+ // we will turn this bswap into something that will be lowered to logical
+ // ops instead of emitting the bswap asm. For now, we don't support 486 or
+ // lower so don't worry about this.
// bswap $0
- if (AsmPieces.size() == 2 &&
- (AsmPieces[0] == "bswap" ||
- AsmPieces[0] == "bswapq" ||
- AsmPieces[0] == "bswapl") &&
- (AsmPieces[1] == "$0" ||
- AsmPieces[1] == "${0:q}")) {
+ if (matchAsm(AsmPieces[0], "bswap", "$0") ||
+ matchAsm(AsmPieces[0], "bswapl", "$0") ||
+ matchAsm(AsmPieces[0], "bswapq", "$0") ||
+ matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
+ matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
+ matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
return IntrinsicLowering::LowerToByteSwap(CI);
}
+
// rorw $$8, ${0:w} --> llvm.bswap.i16
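+    // Rotating a 16-bit value by 8 bits swaps its two bytes, which is
+    // exactly a 16-bit byte swap.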
if (CI->getType()->isIntegerTy(16) &&
- AsmPieces.size() == 3 &&
- (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
- AsmPieces[1] == "$$8," &&
- AsmPieces[2] == "${0:w}" &&
- IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
+ matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -14733,46 +15113,26 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[0] == "~{cc}" &&
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
- IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
- SmallVector<StringRef, 4> Words;
- SplitString(AsmPieces[0], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
- Words[2] == "${0:w}") {
- Words.clear();
- SplitString(AsmPieces[1], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
- Words[2] == "$0") {
- Words.clear();
- SplitString(AsmPieces[2], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
- Words[2] == "${0:w}") {
- AsmPieces.clear();
- const std::string &ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- std::sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- }
- }
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+ matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
+ matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
+ matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+ AsmPieces.clear();
+ const std::string &ConstraintsStr = IA->getConstraintString();
+ SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+ std::sort(AsmPieces.begin(), AsmPieces.end());
+ if (AsmPieces.size() == 4 &&
+ AsmPieces[0] == "~{cc}" &&
+ AsmPieces[1] == "~{dirflag}" &&
+ AsmPieces[2] == "~{flags}" &&
+ AsmPieces[3] == "~{fpsr}")
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
@@ -14781,23 +15141,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- SmallVector<StringRef, 4> Words;
- SplitString(AsmPieces[0], Words, " \t");
- if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
- Words.clear();
- SplitString(AsmPieces[1], Words, " \t");
- if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
- Words.clear();
- SplitString(AsmPieces[2], Words, " \t,");
- if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
- Words[2] == "%edx") {
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- }
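+      // Byte-swapping each 32-bit half and then exchanging the halves
+      // byte-swaps the whole 64-bit value held in EDX:EAX.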
+ if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
+ matchAsm(AsmPieces[1], "bswap", "%edx") &&
+ matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
@@ -14892,7 +15239,8 @@ TargetLowering::ConstraintWeight
break;
case 'x':
case 'Y':
- if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
weight = CW_Register;
break;
case 'I':
@@ -14962,9 +15310,9 @@ LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget->hasXMMInt())
+ if (Subtarget->hasSSE2())
return "Y";
- if (Subtarget->hasXMM())
+ if (Subtarget->hasSSE1())
return "x";
}
@@ -15170,10 +15518,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
if (!Subtarget->hasMMX()) break;
return std::make_pair(0U, X86::VR64RegisterClass);
case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget->hasXMMInt()) break;
+ if (!Subtarget->hasSSE2()) break;
// FALL THROUGH.
- case 'x': // SSE_REGS if SSE1 allowed
- if (!Subtarget->hasXMM()) break;
+ case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+ if (!Subtarget->hasSSE1()) break;
switch (VT.getSimpleVT().SimpleTy) {
default: break;
@@ -15192,6 +15540,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v4f32:
case MVT::v2f64:
return std::make_pair(0U, X86::VR128RegisterClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ return std::make_pair(0U, X86::VR256RegisterClass);
}
break;
}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index cfc1f88..0327b1f 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -219,16 +219,26 @@ namespace llvm {
// VZEXT_MOVL - Vector move low and zero extend.
VZEXT_MOVL,
- // VSHL, VSRL - Vector logical left / right shift.
- VSHL, VSRL,
+ // VSEXT_MOVL - Vector move low and sign extend.
+ VSEXT_MOVL,
- // CMPPD, CMPPS - Vector double/float comparison.
- // CMPPD, CMPPS - Vector double/float comparison.
- CMPPD, CMPPS,
+ // VSHLDQ, VSRLDQ - 128-bit vector logical left / right shift
+ VSHLDQ, VSRLDQ,
+
+ // VSHL, VSRL, VSRA - Vector shift elements
+ VSHL, VSRL, VSRA,
+
+ // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate
+ VSHLI, VSRLI, VSRAI,
+
+ // CMPP - Vector packed double/float comparison.
+ CMPP,
// PCMP* - Vector integer comparisons.
- PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ,
- PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ,
+ PCMPEQ, PCMPGT,
+
+ // VPCOM, VPCOMU - XOP Vector integer comparisons.
+ VPCOM, VPCOMU,
// ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL,
@@ -256,19 +266,13 @@ namespace llvm {
PSHUFD,
PSHUFHW,
PSHUFLW,
- PSHUFHW_LD,
- PSHUFLW_LD,
- SHUFPD,
- SHUFPS,
+ SHUFP,
MOVDDUP,
MOVSHDUP,
MOVSLDUP,
- MOVSHDUP_LD,
- MOVSLDUP_LD,
MOVLHPS,
MOVLHPD,
MOVHLPS,
- MOVHLPD,
MOVLPS,
MOVLPD,
MOVSD,
@@ -279,6 +283,9 @@ namespace llvm {
VPERM2X128,
VBROADCAST,
+ // PMULUDQ - Vector multiply packed unsigned doubleword integers
+ PMULUDQ,
+
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
// with control flow.
@@ -292,6 +299,9 @@ namespace llvm {
// falls back to heap allocation if not.
SEG_ALLOCA,
+ // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+ WIN_FTOL,
+
// Memory barrier
MEMBARRIER,
MFENCE,
@@ -361,77 +371,6 @@ namespace llvm {
/// Define some predicates that are used for node matching.
namespace X86 {
- /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to PSHUFD.
- bool isPSHUFDMask(ShuffleVectorSDNode *N);
-
- /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to PSHUFD.
- bool isPSHUFHWMask(ShuffleVectorSDNode *N);
-
- /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to PSHUFD.
- bool isPSHUFLWMask(ShuffleVectorSDNode *N);
-
- /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to SHUFP*.
- bool isSHUFPMask(ShuffleVectorSDNode *N);
-
- /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
- bool isMOVHLPSMask(ShuffleVectorSDNode *N);
-
- /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
- /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
- /// <2, 3, 2, 3>
- bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N);
-
- /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for MOVLP{S|D}.
- bool isMOVLPMask(ShuffleVectorSDNode *N);
-
- /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for MOVHP{S|D}.
- /// as well as MOVLHPS.
- bool isMOVLHPSMask(ShuffleVectorSDNode *N);
-
- /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to UNPCKL.
- bool isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2,
- bool V2IsSplat = false);
-
- /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to UNPCKH.
- bool isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2,
- bool V2IsSplat = false);
-
- /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
- /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
- /// <0, 0, 1, 1>
- bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N);
-
- /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
- /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
- /// <2, 2, 3, 3>
- bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N);
-
- /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to MOVSS,
- /// MOVSD, and MOVD, i.e. setting the lowest element.
- bool isMOVLMask(ShuffleVectorSDNode *N);
-
- /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
- bool isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
-
- /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
- bool isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
-
- /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
- bool isMOVDDUPMask(ShuffleVectorSDNode *N);
-
/// isVEXTRACTF128Index - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for input to VEXTRACTF128.
@@ -442,19 +381,6 @@ namespace llvm {
/// suitable for input to VINSERTF128.
bool isVINSERTF128Index(SDNode *N);
- /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
- /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
- /// instructions.
- unsigned getShuffleSHUFImmediate(SDNode *N);
-
- /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
- /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction.
- unsigned getShufflePSHUFHWImmediate(SDNode *N);
-
- /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
- /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction.
- unsigned getShufflePSHUFLWImmediate(SDNode *N);
-
/// getExtractVEXTRACTF128Immediate - Return the appropriate
/// immediate to extract the specified EXTRACT_SUBVECTOR index
/// with VEXTRACTF128 instructions.
@@ -688,6 +614,18 @@ namespace llvm {
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
+ /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
+ /// for fptoui.
+ bool isTargetFTOL() const {
+ return Subtarget->isTargetWindows() && !Subtarget->is64Bit();
+ }
+
+ /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
+ /// used for fptoui to the given type.
+ bool isIntegerTypeFTOL(EVT VT) const {
+ return isTargetFTOL() && VT == MVT::i64;
+ }
+
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
@@ -770,7 +708,8 @@ namespace llvm {
SelectionDAG &DAG) const;
std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool isSigned) const;
+ bool isSigned,
+ bool isReplace) const;
SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
SelectionDAG &DAG) const;
@@ -824,6 +763,7 @@ namespace llvm {
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
@@ -837,6 +777,7 @@ namespace llvm {
SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                DAGCombinerInfo &DCI) const;
// Utility functions to help LowerVECTOR_SHUFFLE
SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;
@@ -848,8 +789,8 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+ bool isVarArg, bool doesNotRet, bool &isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -907,7 +848,7 @@ namespace llvm {
unsigned cxchgOpc,
unsigned notOpc,
unsigned EAXreg,
- TargetRegisterClass *RC,
+ const TargetRegisterClass *RC,
bool invSrc = false) const;
MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index dd4f6a5..54b91c3 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -1,4 +1,4 @@
-//====- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index c99c52d..7fa7499 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1,10 +1,10 @@
-//===- X86InstrArithmetic.td - Integer Arithmetic Instrs ---*- tablegen -*-===//
-//
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the integer arithmetic instructions in the X86
@@ -18,22 +18,24 @@
let neverHasSideEffects = 1 in
def LEA16r : I<0x8D, MRMSrcMem,
(outs GR16:$dst), (ins i32mem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
+ "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize;
let isReMaterializable = 1 in
def LEA32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins i32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+ [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ Requires<[In32BitMode]>;
def LEA64_32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins lea64_32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+ [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ Requires<[In64BitMode]>;
let isReMaterializable = 1 in
def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lea{q}\t{$src|$dst}, {$dst|$src}",
- [(set GR64:$dst, lea64addr:$src)]>;
+ [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
@@ -56,16 +58,18 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
"mul{w}\t$src",
- []>, OpSize; // AX,DX = AX*GR16
+ [], IIC_MUL16_REG>, OpSize; // AX,DX = AX*GR16
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
"mul{l}\t$src", // EAX,EDX = EAX*GR32
- [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>;
+ [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+ IIC_MUL32_REG>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
"mul{q}\t$src", // RAX,RDX = RAX*GR64
- [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>;
+ [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+ IIC_MUL64>;
let Defs = [AL,EFLAGS,AX], Uses = [AL] in
def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
@@ -74,21 +78,21 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, (loadi8 addr:$src))),
- (implicit EFLAGS)]>; // AL,AH = AL*[mem8]
+ (implicit EFLAGS)], IIC_MUL8>; // AL,AH = AL*[mem8]
let mayLoad = 1, neverHasSideEffects = 1 in {
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
"mul{w}\t$src",
- []>, OpSize; // AX,DX = AX*[mem16]
+ [], IIC_MUL16_MEM>, OpSize; // AX,DX = AX*[mem16]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
"mul{l}\t$src",
- []>; // EAX,EDX = EAX*[mem32]
+ [], IIC_MUL32_MEM>; // EAX,EDX = EAX*[mem32]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
- "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+ "mul{q}\t$src", [], IIC_MUL64>; // RAX,RDX = RAX*[mem64]
}
let neverHasSideEffects = 1 in {
@@ -130,16 +134,19 @@ let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y
def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize;
+ (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+ TB, OpSize;
def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, GR32:$src2))]>, TB;
+ (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+ TB;
def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, GR64:$src2))]>, TB;
+ (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+ TB;
}
// Register-Memory Signed Integer Multiply
@@ -147,18 +154,23 @@ def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
(ins GR16:$src1, i16mem:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, (load addr:$src2)))]>,
+ (X86smul_flag GR16:$src1, (load addr:$src2)))],
+ IIC_IMUL16_RM>,
TB, OpSize;
def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB;
+ (X86smul_flag GR32:$src1, (load addr:$src2)))],
+ IIC_IMUL32_RM>,
+ TB;
def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB;
+ (X86smul_flag GR64:$src1, (load addr:$src2)))],
+ IIC_IMUL64_RM>,
+ TB;
} // Constraints = "$src1 = $dst"
} // Defs = [EFLAGS]
@@ -170,33 +182,39 @@ def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize;
+ (X86smul_flag GR16:$src1, imm:$src2))],
+ IIC_IMUL16_RRI>, OpSize;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+ IIC_IMUL16_RRI>,
OpSize;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, imm:$src2))]>;
+ (X86smul_flag GR32:$src1, imm:$src2))],
+ IIC_IMUL32_RRI>;
def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>;
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+ IIC_IMUL32_RRI>;
def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
(outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>;
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+ IIC_IMUL64_RRI>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
(outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
- (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>;
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+ IIC_IMUL64_RRI>;
// Memory-Integer Signed Integer Multiply
@@ -204,37 +222,43 @@ def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
(outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))]>,
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL16_RMI>,
OpSize;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
- i16immSExt8:$src2))]>, OpSize;
+ i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+ OpSize;
def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
(outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
- (X86smul_flag (load addr:$src1), imm:$src2))]>;
+ (X86smul_flag (load addr:$src1), imm:$src2))],
+ IIC_IMUL32_RMI>;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
- i32immSExt8:$src2))]>;
+ i32immSExt8:$src2))],
+ IIC_IMUL32_RMI>;
def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
(outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
- i64immSExt32:$src2))]>;
+ i64immSExt32:$src2))],
+ IIC_IMUL64_RMI>;
def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
(outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
"imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR64:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
- i64immSExt8:$src2))]>;
+ i64immSExt8:$src2))],
+ IIC_IMUL64_RMI>;
} // Defs = [EFLAGS]
@@ -243,62 +267,62 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
// unsigned division/remainder
let Defs = [AL,EFLAGS,AX], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "div{b}\t$src", []>;
+ "div{b}\t$src", [], IIC_DIV8_REG>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "div{w}\t$src", []>, OpSize;
+ "div{w}\t$src", [], IIC_DIV16>, OpSize;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "div{l}\t$src", []>;
+ "div{l}\t$src", [], IIC_DIV32>;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
- "div{q}\t$src", []>;
+ "div{q}\t$src", [], IIC_DIV64>;
let mayLoad = 1 in {
let Defs = [AL,EFLAGS,AX], Uses = [AX] in
def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "div{b}\t$src", []>;
+ "div{b}\t$src", [], IIC_DIV8_MEM>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "div{w}\t$src", []>, OpSize;
+ "div{w}\t$src", [], IIC_DIV16>, OpSize;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
- "div{l}\t$src", []>;
+ "div{l}\t$src", [], IIC_DIV32>;
// RDX:RAX/[mem64] = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
- "div{q}\t$src", []>;
+ "div{q}\t$src", [], IIC_DIV64>;
}
// Signed division/remainder.
let Defs = [AL,EFLAGS,AX], Uses = [AX] in
def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
- "idiv{b}\t$src", []>;
+ "idiv{b}\t$src", [], IIC_IDIV8>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "idiv{w}\t$src", []>, OpSize;
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "idiv{l}\t$src", []>;
+ "idiv{l}\t$src", [], IIC_IDIV32>;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
- "idiv{q}\t$src", []>;
+ "idiv{q}\t$src", [], IIC_IDIV64>;
let mayLoad = 1 in {
let Defs = [AL,EFLAGS,AX], Uses = [AX] in
def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
- "idiv{b}\t$src", []>;
+ "idiv{b}\t$src", [], IIC_IDIV8>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "idiv{w}\t$src", []>, OpSize;
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
- "idiv{l}\t$src", []>;
+ "idiv{l}\t$src", [], IIC_IDIV32>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
- "idiv{q}\t$src", []>;
+ "idiv{q}\t$src", [], IIC_IDIV64>;
}
//===----------------------------------------------------------------------===//
@@ -312,35 +336,35 @@ let Constraints = "$src1 = $dst" in {
def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
"neg{b}\t$dst",
[(set GR8:$dst, (ineg GR8:$src1)),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_REG>;
def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"neg{w}\t$dst",
[(set GR16:$dst, (ineg GR16:$src1)),
- (implicit EFLAGS)]>, OpSize;
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize;
def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"neg{l}\t$dst",
[(set GR32:$dst, (ineg GR32:$src1)),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_REG>;
def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
[(set GR64:$dst, (ineg GR64:$src1)),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_REG>;
} // Constraints = "$src1 = $dst"
def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
"neg{b}\t$dst",
[(store (ineg (loadi8 addr:$dst)), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
"neg{w}\t$dst",
[(store (ineg (loadi16 addr:$dst)), addr:$dst),
- (implicit EFLAGS)]>, OpSize;
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize;
def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
"neg{l}\t$dst",
[(store (ineg (loadi32 addr:$dst)), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
[(store (ineg (loadi64 addr:$dst)), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
} // Defs = [EFLAGS]
@@ -351,29 +375,30 @@ let Constraints = "$src1 = $dst" in {
let AddedComplexity = 15 in {
def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
"not{b}\t$dst",
- [(set GR8:$dst, (not GR8:$src1))]>;
+ [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"not{w}\t$dst",
- [(set GR16:$dst, (not GR16:$src1))]>, OpSize;
+ [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize;
def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"not{l}\t$dst",
- [(set GR32:$dst, (not GR32:$src1))]>;
+ [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>;
def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
- [(set GR64:$dst, (not GR64:$src1))]>;
+ [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
}
} // Constraints = "$src1 = $dst"
def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
"not{b}\t$dst",
- [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+ [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
"not{w}\t$dst",
- [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize;
def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
"not{l}\t$dst",
- [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+ [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
- [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+ [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
} // CodeSize
// TODO: inc/dec is slow for P4, but fast for Pentium-M.
@@ -382,19 +407,22 @@ let Constraints = "$src1 = $dst" in {
let CodeSize = 2 in
def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"inc{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>;
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+ IIC_UNARY_REG>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>,
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+ IIC_UNARY_REG>,
OpSize, Requires<[In32BitMode]>;
def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>,
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>,
Requires<[In32BitMode]>;
def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>;
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+ IIC_UNARY_REG>;
} // isConvertibleToThreeAddress = 1, CodeSize = 1
@@ -403,19 +431,23 @@ let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
// Can transform into LEA.
def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>,
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+ IIC_UNARY_REG>,
OpSize, Requires<[In64BitMode]>;
def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>,
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+ IIC_UNARY_REG>,
Requires<[In64BitMode]>;
def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>,
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>,
OpSize, Requires<[In64BitMode]>;
def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>,
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>,
Requires<[In64BitMode]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
@@ -424,37 +456,37 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
let CodeSize = 2 in {
def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
[(store (add (loadi8 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
OpSize, Requires<[In32BitMode]>;
def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
Requires<[In32BitMode]>;
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
// how to unfold them.
// FIXME: What is this for??
def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
OpSize, Requires<[In64BitMode]>;
def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
Requires<[In64BitMode]>;
def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
OpSize, Requires<[In64BitMode]>;
def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
Requires<[In64BitMode]>;
} // CodeSize = 2
@@ -462,18 +494,22 @@ let Constraints = "$src1 = $dst" in {
let CodeSize = 2 in
def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"dec{b}\t$dst",
- [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>;
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+ IIC_UNARY_REG>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>,
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+ IIC_UNARY_REG>,
OpSize, Requires<[In32BitMode]>;
def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>,
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+ IIC_UNARY_REG>,
Requires<[In32BitMode]>;
def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>;
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+ IIC_UNARY_REG>;
} // CodeSize = 2
} // Constraints = "$src1 = $dst"
@@ -481,18 +517,18 @@ def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
let CodeSize = 2 in {
def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
[(store (add (loadi8 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
OpSize, Requires<[In32BitMode]>;
def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>,
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
Requires<[In32BitMode]>;
def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
[(store (add (loadi64 addr:$dst), -1), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>;
} // CodeSize = 2
} // Defs = [EFLAGS]
@@ -588,11 +624,13 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
/// or 1 (for i16,i32,i64 operations).
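+/// 5. Threads an optional InstrItinClass through to the I base class so
+///    subclasses can override the default IIC_BIN_NONMEM itinerary.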
class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
- string mnemonic, string args, list<dag> pattern>
+ string mnemonic, string args, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
: I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
f, outs, ins,
- !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> {
+ !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+ itin> {
// Infer instruction prefixes from type info.
let hasOpSizePrefix = typeinfo.HasOpSizePrefix;
@@ -664,7 +702,7 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
dag outlist, list<dag> pattern>
: ITy<opcode, MRMSrcMem, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>;
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_MEM>;
// BinOpRM_R - Instructions like "add reg, reg, [mem]".
class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -776,7 +814,7 @@ class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
list<dag> pattern>
: ITy<opcode, MRMDestMem, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern>;
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>;
// BinOpMR_RMW - Instructions like "add [mem], reg".
class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -804,7 +842,7 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
Format f, list<dag> pattern, bits<8> opcode = 0x80>
: ITy<opcode, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> {
let ImmT = typeinfo.ImmEncoding;
}
@@ -837,7 +875,7 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
Format f, list<dag> pattern>
: ITy<0x82, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern> {
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -1150,7 +1188,7 @@ let Defs = [EFLAGS] in {
// register class is constrained to GR8_NOREX.
let isPseudo = 1 in
def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
- "", []>;
+ "", [], IIC_BIN_NONMEM>;
}
//===----------------------------------------------------------------------===//
@@ -1160,11 +1198,12 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
PatFrag ld_frag> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))]>;
+ [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))],
+ IIC_BIN_NONMEM>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
- (X86andn_flag RC:$src1, (ld_frag addr:$src2)))]>;
+ (X86andn_flag RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 3a43b22..adeaf54 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -1,10 +1,10 @@
-//===- X86InstrCMovSetCC.td - Conditional Move and SetCC ---*- tablegen -*-===//
-//
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 conditional move and set on condition
@@ -21,17 +21,20 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
: I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,TB,OpSize;
+ (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+ IIC_CMOV16_RR>,TB,OpSize;
def #NAME#32rr
: I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>, TB;
+ (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
def #NAME#64rr
:RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB;
+ (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+ IIC_CMOV32_RR>, TB;
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in {
@@ -39,17 +42,18 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
: I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- CondNode, EFLAGS))]>, TB, OpSize;
+ CondNode, EFLAGS))], IIC_CMOV16_RM>,
+ TB, OpSize;
def #NAME#32rm
: I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))]>, TB;
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
def #NAME#64rm
:RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- CondNode, EFLAGS))]>, TB;
+ CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
} // end multiclass
@@ -78,10 +82,12 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
let Uses = [EFLAGS] in {
def r : I<opc, MRM0r, (outs GR8:$dst), (ins),
!strconcat(Mnemonic, "\t$dst"),
- [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>, TB;
+ [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+ IIC_SET_R>, TB;
def m : I<opc, MRM0m, (outs), (ins i8mem:$dst),
!strconcat(Mnemonic, "\t$dst"),
- [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>, TB;
+ [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+ IIC_SET_M>, TB;
} // Uses = [EFLAGS]
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index e0cf669..ac49232 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -125,10 +125,26 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
[(set GR64:$dst,
(X86SegAlloca GR64:$size))]>,
Requires<[In64BitMode]>;
-
}
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. No
+// other registers (aside from flags) are touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
+// variant is unnecessary.
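+// Only fptoui to i64 is lowered through these pseudos; see
+// isIntegerTypeFTOL() in X86ISelLowering.h.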
+
+let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+ def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
+ "# win32 fptoui",
+ [(X86WinFTOL RFP32:$src)]>,
+ Requires<[In32BitMode]>;
+ def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
+ "# win32 fptoui",
+ [(X86WinFTOL RFP64:$src)]>,
+ Requires<[In32BitMode]>;
+}
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
@@ -551,7 +567,7 @@ def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
// Memory barriers
// TODO: Get this to fold the constant into the instruction.
-let isCodeGenOnly = 1 in
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
"lock\n\t"
"or{l}\t{$zero, $dst|$dst, $zero}",
@@ -562,15 +578,6 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
"#MEMBARRIER",
[(X86MemBarrier)]>;
-// TODO: Get this to fold the constant into the instruction.
-let hasSideEffects = 1, Defs = [ESP], isCodeGenOnly = 1 in
-def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero),
- "lock\n\t"
- "or{q}\t{$zero, (%rsp)|(%rsp), $zero}",
- [(X86MemBarrierNoSSE GR64:$zero)]>,
- Requires<[In64BitMode]>, LOCK;
-
-
// RegOpc corresponds to the mr version of the instruction
// ImmOpc corresponds to the mi version of the instruction
// ImmOpc8 corresponds to the mi8 version of the instruction
@@ -954,14 +961,9 @@ def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))),
// Direct PC relative function call for small code model. 32-bit displacement
// sign extended to 64-bit.
def : Pat<(X86call (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>;
-def : Pat<(X86call (i64 texternalsym:$dst)),
- (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>;
-
-def : Pat<(X86call (i64 tglobaladdr:$dst)),
- (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>;
+ (CALL64pcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i64 texternalsym:$dst)),
- (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>;
+ (CALL64pcrel32 texternalsym:$dst)>;
// tailcall stuff
def : Pat<(X86tcret GR32_TC:$dst, imm:$off),
@@ -1458,58 +1460,62 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+// Helper imms that check if a mask doesn't change significant shift bits.
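+// x86 shifts already mask their count to 5 bits (6 for 64-bit shifts), so an
+// AND whose mask preserves those low bits cannot change the shifted result.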
+def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
+def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
+
// (shl x (and y, 31)) ==> (shl x, y)
-def : Pat<(shl GR8:$src1, (and CL, 31)),
+def : Pat<(shl GR8:$src1, (and CL, immShift32)),
(SHL8rCL GR8:$src1)>;
-def : Pat<(shl GR16:$src1, (and CL, 31)),
+def : Pat<(shl GR16:$src1, (and CL, immShift32)),
(SHL16rCL GR16:$src1)>;
-def : Pat<(shl GR32:$src1, (and CL, 31)),
+def : Pat<(shl GR32:$src1, (and CL, immShift32)),
(SHL32rCL GR32:$src1)>;
-def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHL8mCL addr:$dst)>;
-def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHL16mCL addr:$dst)>;
-def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHL32mCL addr:$dst)>;
-def : Pat<(srl GR8:$src1, (and CL, 31)),
+def : Pat<(srl GR8:$src1, (and CL, immShift32)),
(SHR8rCL GR8:$src1)>;
-def : Pat<(srl GR16:$src1, (and CL, 31)),
+def : Pat<(srl GR16:$src1, (and CL, immShift32)),
(SHR16rCL GR16:$src1)>;
-def : Pat<(srl GR32:$src1, (and CL, 31)),
+def : Pat<(srl GR32:$src1, (and CL, immShift32)),
(SHR32rCL GR32:$src1)>;
-def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHR8mCL addr:$dst)>;
-def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHR16mCL addr:$dst)>;
-def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
(SHR32mCL addr:$dst)>;
-def : Pat<(sra GR8:$src1, (and CL, 31)),
+def : Pat<(sra GR8:$src1, (and CL, immShift32)),
(SAR8rCL GR8:$src1)>;
-def : Pat<(sra GR16:$src1, (and CL, 31)),
+def : Pat<(sra GR16:$src1, (and CL, immShift32)),
(SAR16rCL GR16:$src1)>;
-def : Pat<(sra GR32:$src1, (and CL, 31)),
+def : Pat<(sra GR32:$src1, (and CL, immShift32)),
(SAR32rCL GR32:$src1)>;
-def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
(SAR8mCL addr:$dst)>;
-def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
(SAR16mCL addr:$dst)>;
-def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
(SAR32mCL addr:$dst)>;
// (shl x (and y, 63)) ==> (shl x, y)
-def : Pat<(shl GR64:$src1, (and CL, 63)),
+def : Pat<(shl GR64:$src1, (and CL, immShift64)),
(SHL64rCL GR64:$src1)>;
def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SHL64mCL addr:$dst)>;
-def : Pat<(srl GR64:$src1, (and CL, 63)),
+def : Pat<(srl GR64:$src1, (and CL, immShift64)),
(SHR64rCL GR64:$src1)>;
def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SHR64mCL addr:$dst)>;
-def : Pat<(sra GR64:$src1, (and CL, 63)),
+def : Pat<(sra GR64:$src1, (and CL, immShift64)),
(SAR64rCL GR64:$src1)>;
def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SAR64mCL addr:$dst)>;
@@ -1753,3 +1759,11 @@ def : Pat<(and GR64:$src1, i64immSExt8:$src2),
(AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
def : Pat<(and GR64:$src1, i64immSExt32:$src2),
(AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
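+// BSF leaves its destination undefined when the source is zero, which is
+// exactly the latitude cttz_zero_undef grants, so no zero check is needed.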
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index c228a0a..ba86098 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -1,4 +1,4 @@
-//===- X86InstrControl.td - Control Flow Instructions ------*- tablegen -*-===//
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -20,39 +20,42 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP in {
def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret",
- [(X86retflag 0)]>;
+ [(X86retflag 0)], IIC_RET>;
def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret\t$amt",
- [(X86retflag timm:$amt)]>;
+ [(X86retflag timm:$amt)], IIC_RET_IMM>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"retw\t$amt",
- []>, OpSize;
+ [], IIC_RET_IMM>, OpSize;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
- "lretl", []>;
+ "lretl", [], IIC_RET>;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
- "lretq", []>;
+ "lretq", [], IIC_RET>;
def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "lret\t$amt", []>;
+ "lret\t$amt", [], IIC_RET>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "lretw\t$amt", []>, OpSize;
+ "lretw\t$amt", [], IIC_RET>, OpSize;
}
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
- "jmp\t$dst", [(br bb:$dst)]>;
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
- "jmp\t$dst", []>;
+ "jmp\t$dst", [], IIC_JMP_REL>;
+ // FIXME: Give JMP64pcrel32 an Intel syntax that is not ambiguous
+ // with JMP_1.
def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
- "jmp{q}\t$dst", []>;
+ "jmpq\t$dst", [], IIC_JMP_REL>;
}
// Conditional Branches.
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
- def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, []>;
+ def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
+ IIC_Jcc>;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)]>, TB;
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB;
}
}
@@ -74,61 +77,61 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
// jcx/jecx/jrcx instructions.
-let isAsmParserOnly = 1, isBranch = 1, isTerminator = 1 in {
+let isBranch = 1, isTerminator = 1 in {
// These are the 32-bit versions of this instruction for the asmparser. In
// 32-bit mode, the address size prefix is jcxz and the unprefixed version is
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jcxz\t$dst", []>, AdSize, Requires<[In32BitMode]>;
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>;
let Uses = [ECX] in
def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jecxz\t$dst", []>, Requires<[In32BitMode]>;
+ "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>;
// J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
// In 64-bit mode, the address size prefix is jecxz and the unprefixed version
// is jrcxz.
let Uses = [ECX] in
def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jecxz\t$dst", []>, AdSize, Requires<[In64BitMode]>;
+ "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jrcxz\t$dst", []>, Requires<[In64BitMode]>;
+ "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
- [(brind GR32:$dst)]>, Requires<[In32BitMode]>;
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
- [(brind (loadi32 addr:$dst))]>, Requires<[In32BitMode]>;
+ [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
- [(brind GR64:$dst)]>, Requires<[In64BitMode]>;
+ [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
- [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>;
+ [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>;
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "ljmp{w}\t{$seg, $off|$off, $seg}", []>, OpSize;
+ "ljmp{w}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>, OpSize;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "ljmp{l}\t{$seg, $off|$off, $seg}", []>;
+ "ljmp{l}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>;
def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
- "ljmp{q}\t{*}$dst", []>;
+ "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>;
def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
- "ljmp{w}\t{*}$dst", []>, OpSize;
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "ljmp{l}\t{*}$dst", []>;
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>;
}
// Loop instructions
-def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
-def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
-def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
//===----------------------------------------------------------------------===//
// Call Instructions...
@@ -138,32 +141,30 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in {
+ let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst,variable_ops),
- "call{l}\t$dst", []>, Requires<[In32BitMode]>;
+ "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
- "call{l}\t{*}$dst", [(X86call GR32:$dst)]>,
+ "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
Requires<[In32BitMode]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
- "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
+ "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>,
Requires<[In32BitMode]>;
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
- "lcall{w}\t{$seg, $off|$off, $seg}", []>, OpSize;
+ "lcall{w}\t{$seg, $off|$off, $seg}", [],
+ IIC_CALL_FAR_PTR>, OpSize;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
- "lcall{l}\t{$seg, $off|$off, $seg}", []>;
+ "lcall{l}\t{$seg, $off|$off, $seg}", [],
+ IIC_CALL_FAR_PTR>;
def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
- "lcall{w}\t{*}$dst", []>, OpSize;
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "lcall{l}\t{*}$dst", []>;
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
// callw for 16-bit code for the assembler.
let isAsmParserOnly = 1 in
@@ -177,11 +178,7 @@ let isCall = 1 in
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1 in
- let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in {
+ let Uses = [ESP] in {
def TCRETURNdi : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
def TCRETURNri : PseudoI<(outs),
@@ -194,74 +191,43 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
(ins i32imm_pcrel:$dst, variable_ops),
- "jmp\t$dst # TAILCALL",
- []>;
+ "jmp\t$dst # TAILCALL",
+ [], IIC_JMP_REL>;
def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
- "", []>; // FIXME: Remove encoding when JIT is dead.
+ "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops),
- "jmp{l}\t{*}$dst # TAILCALL", []>;
+ "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
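These TAILJMP* forms are never selected directly; ISel produces the TCRETURN* pseudos above, and a late pass rewrites them into the matching TAILJMP*. A rough sketch of the feeding patterns, which in this era live in X86InstrCompiler.td (reconstructed from memory, so treat the exact spellings as assumptions):

    def : Pat<(X86tcret GR32_TC:$dst, imm:$off),
              (TCRETURNri GR32_TC:$dst, imm:$off)>,
          Requires<[In32BitMode]>;
    def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
              (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
          Requires<[In32BitMode]>;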
//===----------------------------------------------------------------------===//
// Call Instructions...
//
-let isCall = 1 in
- // All calls clobber the non-callee saved registers. RSP is marked as
- // a use to prevent stack-pointer assignments that appear immediately
- // before calls from potentially appearing dead. Uses for argument
- // registers are added manually.
- let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
- FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [RSP] in {
-
- // NOTE: this pattern doesn't match "X86call imm", because we do not know
- // that the offset between an arbitrary immediate and the call will fit in
- // the 32-bit pcrel field that we have.
- def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
- "call{q}\t$dst", []>,
- Requires<[In64BitMode, NotWin64]>;
- def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
- "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode, NotWin64]>;
- def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
- "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
- Requires<[In64BitMode, NotWin64]>;
- def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
- "lcall{q}\t{*}$dst", []>;
- }
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP] in {
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ "call{q}\t$dst", [], IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+ IIC_CALL_RI>,
+ Requires<[In64BitMode]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+ IIC_CALL_MEM>,
+ Requires<[In64BitMode]>;
- // FIXME: We need to teach codegen about a single list of call-clobbered
- // registers.
-let isCall = 1, isCodeGenOnly = 1 in
- // All calls clobber the non-callee saved registers. RSP is marked as
- // a use to prevent stack-pointer assignments that appear immediately
- // before calls from potentially appearing dead. Uses for argument
- // registers are added manually.
- let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
- FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
- Uses = [RSP] in {
- def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
- "call{q}\t$dst", []>,
- Requires<[IsWin64]>;
- def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
- "call{q}\t{*}$dst",
- [(X86call GR64:$dst)]>, Requires<[IsWin64]>;
- def WINCALL64m : I<0xFF, MRM2m, (outs),
- (ins i64mem:$dst,variable_ops),
- "call{q}\t{*}$dst",
- [(X86call (loadi64 addr:$dst))]>,
- Requires<[IsWin64]>;
- }
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
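Because of the NOTE above, CALL64pcrel32 itself carries no pattern; direct calls reach it through symbol-operand patterns elsewhere in the backend, where the small code model is assumed to keep targets within the signed 32-bit pc-relative range. A sketch of those patterns as they look once this patch drops the NotWin64/IsWin64 split (exact placement is an assumption):

    def : Pat<(X86call (i64 tglobaladdr:$dst)),
              (CALL64pcrel32 tglobaladdr:$dst)>,
          Requires<[In64BitMode]>;
    def : Pat<(X86call (i64 texternalsym:$dst)),
              (CALL64pcrel32 texternalsym:$dst)>,
          Requires<[In64BitMode]>;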
let isCall = 1, isCodeGenOnly = 1 in
// __chkstk(MSVC): clobber R10, R11 and EFLAGS.
@@ -270,18 +236,13 @@ let isCall = 1, isCodeGenOnly = 1 in
Uses = [RSP] in {
def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst, variable_ops),
- "call{q}\t$dst", []>,
+ "call{q}\t$dst", [], IIC_CALL_RI>,
Requires<[IsWin64]>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1 in
- // AMD64 cc clobbers RSI, RDI, XMM6-XMM15.
- let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
- FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
- Uses = [RSP],
+ let Uses = [RSP],
usesCustomInserter = 1 in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
@@ -294,11 +255,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
(ins i64i32imm_pcrel:$dst, variable_ops),
- "jmp\t$dst # TAILCALL", []>;
+ "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops),
- "jmp{q}\t{*}$dst # TAILCALL", []>;
+ "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
let mayLoad = 1 in
def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops),
- "jmp{q}\t{*}$dst # TAILCALL", []>;
+ "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index e62e6b7..0d5490a 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -1,10 +1,10 @@
-//===- X86InstrExtension.td - Sign and Zero Extensions -----*- tablegen -*-===//
-//
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the sign and zero extension operations.
@@ -37,40 +37,47 @@ let neverHasSideEffects = 1 in {
}
+
// Sign/Zero extenders
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+ TB, OpSize;
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+ TB, OpSize;
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR8:$src))]>, TB;
+ [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB;
def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB;
def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sext GR16:$src))]>, TB;
+ [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB;
def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+ TB;
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+ TB, OpSize;
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
- "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+ TB, OpSize;
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR8:$src))]>, TB;
+ [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB;
def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB;
def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zext GR16:$src))]>, TB;
+ [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB;
def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+ TB;
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
@@ -78,12 +85,12 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- []>, TB;
+ [], IIC_MOVZX>, TB;
let mayLoad = 1 in
def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
- []>, TB;
+ [], IIC_MOVZX>, TB;
// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
// operand, which makes it a rare instruction with an 8-bit register
@@ -91,32 +98,38 @@ def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
// were generalized, this would require a special register class.
def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR8:$src))]>, TB;
+ [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB;
def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
"movs{bq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+ TB;
def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR16:$src))]>, TB;
+ [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB;
def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movs{wq|x}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+ TB;
def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sext GR32:$src))]>;
+ [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>;
def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
"movs{lq|xd}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>;
// movzbq and movzwq encodings for the disassembler
def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB;
def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
- "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB;
def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB;
def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+ TB;
// FIXME: These should be Pat patterns.
let isCodeGenOnly = 1 in {
@@ -124,15 +137,17 @@ let isCodeGenOnly = 1 in {
// Use movzbl instead of movzbq when the destination is a register; it's
// equivalent due to implicit zero-extending, and it has a smaller encoding.
def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
- "", [(set GR64:$dst, (zext GR8:$src))]>, TB;
+ "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB;
def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
- "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+ "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
+ TB;
// Use movzwl instead of movzwq when the destination is a register; it's
// equivalent due to implicit zero-extending, and it has a smaller encoding.
def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "", [(set GR64:$dst, (zext GR16:$src))]>, TB;
+ "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB;
def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+ "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
+ IIC_MOVZX>, TB;
// There's no movzlq instruction, but movl can be used for this purpose, using
// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
@@ -142,10 +157,9 @@ def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
// necessarily all zero. In such cases, we fall back to these explicit zext
// instructions.
def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
- "", [(set GR64:$dst, (zext GR32:$src))]>;
+ "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>;
def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
- "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
-
-
+ "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
+ IIC_MOVZX>;
}
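The FIXME above ("These should be Pat patterns") points at expressing implicit zero-extension without fake MOVZX64* records: since any 32-bit register write zeroes bits 63:32 on x86-64, a plain MOV32rr/MOV32rm already is the zext. A hedged sketch of the Pat form that later LLVM adopted (not part of this patch):

    def : Pat<(i64 (zext GR32:$src)),
              (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
    def : Pat<(zextloadi64i32 addr:$src),
              (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;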
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index f443088..d57937b 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -1,4 +1,4 @@
-//====- X86InstrFMA.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,7 +15,7 @@
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-multiclass fma_rm<bits<8> opc, string OpcodeStr> {
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -34,415 +34,187 @@ multiclass fma_rm<bits<8> opc, string OpcodeStr> {
[]>;
}
-multiclass fma_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr, string PackTy> {
- defm r132 : fma_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
- defm r213 : fma_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
- defm r231 : fma_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr, string PackTy> {
+ defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
+ defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
+ defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
}
-let isAsmParserOnly = 1 in {
- // Fused Multiply-Add
- defm VFMADDPS : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">;
- defm VFMADDPD : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W;
- defm VFMADDSUBPS : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">;
- defm VFMADDSUBPD : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W;
- defm VFMSUBADDPS : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">;
- defm VFMSUBADDPD : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W;
- defm VFMSUBPS : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">;
- defm VFMSUBPD : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W;
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">;
+ defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">;
+ defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W;
+}
+
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> {
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>;
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>;
+}
- // Fused Negative Multiply-Add
- defm VFNMADDPS : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">;
- defm VFNMADDPD : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W;
- defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">;
- defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W;
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpcodeStr> {
+ defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>;
+ defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>;
+ defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>;
+ defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W;
+ defm SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W;
+ defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W;
}
+defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG;
+defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG;
+
+defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG;
+defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG;
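Naming in these multiclasses is concatenative: the defm prefix plus the inner defm and def suffixes. So "defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">" fans out to twelve records (SS/SD by 132/213/231 by reg/mem). One reconstructed leaf, for orientation only:

    // VFMADD + SSr132 + r:
    def VFMADDSSr132r : FMA3<0x99, MRMSrcReg, (outs VR128:$dst),
                             (ins VR128:$src1, VR128:$src2),
                             "vfmadd132ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                             []>;  // plus VFMADDSSr132m taking f32mem:$src2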
+
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-multiclass fma4s<bits<8> opc, string OpcodeStr> {
+multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>;
-
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+// For disassembler
+let isCodeGenOnly = 1 in
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
}
-multiclass fma4p<bits<8> opc, string OpcodeStr> {
+multiclass fma4p<bits<8> opc, string OpcodeStr,
+ Intrinsic Int128, Intrinsic Int256,
+ PatFrag ld_frag128, PatFrag ld_frag256> {
def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2,
+ (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>;
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4;
def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_W;
+ [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>;
-}
-
-let isAsmParserOnly = 1 in {
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss">;
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">;
- defm VFMADDPS4 : fma4p<0x68, "vfmaddps">;
- defm VFMADDPD4 : fma4p<0x69, "vfmaddpd">;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss">;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd">;
- defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps">;
- defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd">;
- defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss">;
- defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd">;
- defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps">;
- defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd">;
- defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss">;
- defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd">;
- defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps">;
- defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd">;
- defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps">;
- defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd">;
- defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps">;
- defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd">;
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+// For disassembler
+let isCodeGenOnly = 1 in {
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+} // isCodeGenOnly = 1
}
-// FMA4 Intrinsics patterns
-
-// VFMADD
-def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-// VFMSUB
-def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-// VFNMADD
-def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFNMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFNMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFNMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFNMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFNMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFNMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFNMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFNMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFNMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFNMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFNMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFNMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFNMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFNMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-// VFNMSUB
-def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFNMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFNMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFNMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFNMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFNMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFNMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFNMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFNMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFNMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFNMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFNMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFNMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFNMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFNMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFNMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-// VFMADDSUB
-def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMADDSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMADDSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMADDSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMADDSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMADDSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMADDSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFMADDSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFMADDSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMADDSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFMADDSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFMADDSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-// VFMSUBADD
-def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2,
- (alignedloadv4f32 addr:$src3)),
- (VFMSUBADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
- VR128:$src3),
- (VFMSUBADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
- (VFMSUBADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2,
- (alignedloadv2f64 addr:$src3)),
- (VFMSUBADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
- VR128:$src3),
- (VFMSUBADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMSUBADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2,
- (alignedloadv8f32 addr:$src3)),
- (VFMSUBADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1,
- (alignedloadv8f32 addr:$src2),
- VR256:$src3),
- (VFMSUBADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
-
-def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
- (VFMSUBADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2,
- (alignedloadv4f64 addr:$src3)),
- (VFMSUBADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
-def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1,
- (alignedloadv4f64 addr:$src2),
- VR256:$src3),
- (VFMSUBADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma4_vfmadd_ss>;
+defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma4_vfmadd_sd>;
+defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps,
+ int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd,
+ int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>;
+defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma4_vfmsub_ss>;
+defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma4_vfmsub_sd>;
+defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps,
+ int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd,
+ int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma4_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma4_vfnmadd_sd>;
+defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma4_vfnmadd_ps,
+ int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd,
+ int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma4_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma4_vfnmsub_sd>;
+defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma4_vfnmsub_ps,
+ int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd,
+ int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps,
+ int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd,
+ int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps,
+ int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd,
+ int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
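Each defm above now expands to pattern-bearing instructions rather than asm-parser stubs. One reconstructed leaf shows how the pieces combine, and how MemOp4 (the repurposed VEX.W bit) tags the form whose memory operand is $src3 so the encoder swaps operands accordingly (a sketch derived from the fma4s multiclass above):

    // VFMADDSS4 + rm:
    def VFMADDSS4rm : FMA4<0x6A, MRMSrcMem, (outs VR128:$dst),
                           (ins VR128:$src1, VR128:$src2, ssmem:$src3),
                           "vfmaddss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                           [(set VR128:$dst,
                             (int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2,
                                                     sse_load_f32:$src3))]>,
                      VEX_W, MemOp4;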
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 7cb870f..a13887e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -1,10 +1,10 @@
-//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=//
-//
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 x87 FPU instruction set, defining the
@@ -225,22 +225,22 @@ class FPrST0PInst<bits<8> o, string asm>
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
@@ -330,21 +330,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA;
+ "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA;
+ "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA;
+ "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovu\t {$op, %st(0)|%ST(0), $op}">, DA;
+ "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB;
+ "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB;
+ "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB;
+ "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB;
+ "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
} // Predicates = [HasCMov]
// Floating point loads & stores.
@@ -437,33 +437,26 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
}
// FISTTP requires SSE3 even though it's an FPStack op.
+let Predicates = [HasSSE3] in {
def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
- [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
- [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
- [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
- [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
- [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
- [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
- [(X86fp_to_i16mem RFP80:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
- [(X86fp_to_i32mem RFP80:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
- [(X86fp_to_i64mem RFP80:$src, addr:$op)]>,
- Requires<[HasSSE3]>;
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
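This hoist is behavior-preserving: Requires<[...]> is defined in Target.td as a class that simply sets the record's Predicates field, so wrapping the block in "let Predicates = [HasSSE3]" states the constraint once instead of nine times. A minimal equivalence sketch (the FOO_* defs are hypothetical):

    let Predicates = [HasSSE3] in
    def FOO_a : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;

    // ...produces the same Predicates field as:
    def FOO_b : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>,
                Requires<[HasSSE3]>;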
let mayStore = 1 in {
def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 7ba3639..b387090 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -1,10 +1,10 @@
-//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===//
-//
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -43,6 +43,15 @@ def RawFrmImm8 : Format<43>;
def RawFrmImm16 : Format<44>;
def MRM_D0 : Format<45>;
def MRM_D1 : Format<46>;
+def MRM_D4 : Format<47>;
+def MRM_D8 : Format<48>;
+def MRM_D9 : Format<49>;
+def MRM_DA : Format<50>;
+def MRM_DB : Format<51>;
+def MRM_DC : Format<52>;
+def MRM_DD : Format<53>;
+def MRM_DE : Format<54>;
+def MRM_DF : Format<55>;
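These Format values name instructions whose ModRM byte is a fixed constant rather than a reg/mem field; MRM_D8 through MRM_DF correspond to the AMD SVM opcodes 0F 01 D8-DF (vmrun, vmmcall, vmload, vmsave, stgi, clgi, skinit, invlpga) defined by the X86InstrSVM.td this patch adds. A sketch of how such a format is consumed, modeled on the existing MRM_D0/MRM_D1 users (operand details elided):

    def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
    def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun",   []>, TB,
                  Requires<[In64BitMode]>;  // vmrun takes its VMCB in rAX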
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -120,10 +129,12 @@ class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
-class XOP_W { bit hasXOP_WPrefix = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
class XOP { bit hasXOP_Prefix = 1; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
- string AsmStr, Domain d = GenericDomain>
+ string AsmStr,
+ InstrItinClass itin,
+ Domain d = GenericDomain>
: Instruction {
let Namespace = "X86";
@@ -139,6 +150,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+ let Itinerary = itin;
+
//
// Attributes specific to X86 instructions...
//
@@ -161,7 +174,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
- bit hasXOP_WPrefix = 0; // Same bit as VEX_W, but used for swapping operands
+ bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix?
// TSFlags layout should be kept in sync with X86InstrInfo.h.
@@ -184,56 +197,58 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{38} = hasVEX_L;
let TSFlags{39} = ignoresVEX_L;
let TSFlags{40} = has3DNow0F0FOpcode;
- let TSFlags{41} = hasXOP_WPrefix;
+ let TSFlags{41} = hasMemOp4Prefix;
let TSFlags{42} = hasXOP_Prefix;
}
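The new itin parameter, threaded through every template below and defaulting to IIC_DEFAULT, attaches an InstrItinClass to each instruction so that a subtarget itinerary can map it to pipeline stages. A minimal sketch of both halves, using the standard TargetItinerary.td machinery with placeholder functional units:

    def HypoPort0 : FuncUnit;          // placeholder units, not real X86 defs
    def HypoPort1 : FuncUnit;
    def IIC_Jcc   : InstrItinClass;    // this patch declares many IIC_* classes

    def HypoItineraries : ProcessorItineraries<[HypoPort0, HypoPort1], [], [
      InstrItinData<IIC_Jcc, [InstrStage<1, [HypoPort1]>]>
    ]>;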
class PseudoI<dag oops, dag iops, list<dag> pattern>
- : X86Inst<0, Pseudo, NoImm, oops, iops, ""> {
+ : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
let Pattern = pattern;
}
class I<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d = GenericDomain>
- : X86Inst<o, f, NoImm, outs, ins, asm, d> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d = GenericDomain>
- : X86Inst<o, f, Imm8, outs, ins, asm, d> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm8PCRel, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm16, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm32, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm16PCRel, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm32PCRel, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
@@ -244,8 +259,9 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
: I<o, F, outs, ins, asm, []> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
-class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
- : X86Inst<0, Pseudo, NoImm, outs, ins, ""> {
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
let FPForm = fp;
let Pattern = pattern;
}
@@ -257,20 +273,23 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
// Iseg32 - 16-bit segment selector, 32-bit offset
class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern> : X86Inst<o, f, Imm16, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern> : X86Inst<o, f, Imm32, outs, ins, asm> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
let Pattern = pattern;
let CodeSize = 3;
}
// SI - SSE 1 & 2 scalar instructions
-class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern> {
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
@@ -280,8 +299,8 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
// SIi8 - SSE 1 & 2 scalar instructions
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern> {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
@@ -291,8 +310,8 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// PI - SSE 1 & 2 packed instructions
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
- Domain d>
- : I<o, F, outs, ins, asm, pattern, d> {
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
@@ -302,8 +321,8 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, Domain d>
- : Ii8<o, F, outs, ins, asm, pattern, d> {
+ list<dag> pattern, InstrItinClass itin, Domain d>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
@@ -319,30 +338,28 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// VSSI - SSE1 instructions with XS prefix in AVX form.
// VPSI - SSE1 instructions with TB prefix in AVX form.
-class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
-class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
Requires<[HasSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
Requires<[HasSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB,
Requires<[HasAVX]>;
-class VoPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
- Requires<[HasXMM]>;
// SSE2 Instruction Templates:
//
@@ -354,28 +371,30 @@ class VoPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// VSDI - SSE2 instructions with XD prefix in AVX form.
// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
-class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
-class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[HasSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[HasSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, TB,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
OpSize, Requires<[HasAVX]>;
// SSE3 Instruction Templates:
@@ -385,15 +404,16 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
// S3DI - SSE3 instructions with XD prefix.
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
Requires<[HasSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
Requires<[HasSSE3]>;
-class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
Requires<[HasSSE3]>;
@@ -403,16 +423,16 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
// SS3AI - SSSE3 instructions with TA prefix.
//
// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
-// uses the MMX registers. We put those instructions here because they better
-// fit into the SSSE3 instruction category rather than the MMX category.
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSSE3]>;
// SSE4.1 Instruction Templates:
@@ -421,31 +441,31 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSE41]>;
// SSE4.2 Instruction Templates:
//
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>;
-
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
// SS42AI - SSE 4.2 instructions with TA prefix.
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSE42]>;
// AVX Instruction Templates:
@@ -454,12 +474,12 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX8I - AVX instructions with T8 and OpSize prefix.
// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
Requires<[HasAVX]>;
// AVX2 Instruction Templates:
@@ -468,12 +488,12 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AVX28I - AVX2 instructions with T8 and OpSize prefix.
// AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
Requires<[HasAVX2]>;
// AES Instruction Templates:
@@ -481,87 +501,88 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// AES8I
// These use the same T8 and TA encodings as SSE4.2.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
- Requires<[HasAES]>;
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[HasSSE2, HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
- Requires<[HasAES]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[HasSSE2, HasAES]>;
// CLMUL Instruction Templates
class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
- OpSize, Requires<[HasCLMUL]>;
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ OpSize, Requires<[HasSSE2, HasCLMUL]>;
class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8,
OpSize, VEX_4V, Requires<[HasFMA3]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP, XOP9, Requires<[HasXOP]>;
// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP, XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
-class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
- list<dag> pattern>
- : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W {
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
let Pattern = pattern;
let CodeSize = 3;
}
class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : SSI<o, F, outs, ins, asm, pattern>, REX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : SDI<o, F, outs, ins, asm, pattern>, REX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
// MMX Instruction templates
//
@@ -574,23 +595,23 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXID - MMX instructions with XD prefix.
// MMXIS - MMX instructions with XS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>;
class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
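
Taken together, the X86InstrFormats.td hunks above make one mechanical change: every instruction template gains a trailing InstrItinClass parameter defaulting to IIC_DEFAULT and forwards it to its base class, so a scheduling itinerary can be attached per instruction while every pre-existing definition keeps compiling unchanged. A rough C++ analogue of the pattern, with illustrative names only (this is not LLVM code):

#include <iostream>

enum InstrItinClass { IIC_DEFAULT, IIC_ALU, IIC_SSE };

// Base "template": owns the itinerary field.
struct X86Inst {
  explicit X86Inst(InstrItinClass itin) : Itinerary(itin) {}
  InstrItinClass Itinerary;
};

// Derived "template": adds a defaulted trailing parameter and forwards
// it, mirroring how SI, PI, MMXI, etc. now pass itin through to X86Inst.
struct SI : X86Inst {
  explicit SI(InstrItinClass itin = IIC_DEFAULT) : X86Inst(itin) {}
};

int main() {
  SI legacy;             // pre-patch style definition: gets IIC_DEFAULT
  SI scheduled(IIC_SSE); // post-patch style: explicit itinerary
  std::cout << legacy.Itinerary << " " << scheduled.Itinerary << "\n"; // 0 2
}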
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index cd13bc4..4f9f089 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1,10 +1,10 @@
-//======- X86InstrFragmentsSIMD.td - x86 ISA -------------*- tablegen -*-=====//
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file provides pattern fragments useful for SIMD instructions.
@@ -48,7 +48,7 @@ def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
- SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86andnp : SDNode<"X86ISD::ANDNP",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -71,20 +71,30 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
-def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
-def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>;
-def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>;
-def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
-def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
-def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
-def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
-def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86vshl : SDNode<"X86ISD::VSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsrl : SDNode<"X86ISD::VSRL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+def X86vsra : SDNode<"X86ISD::VSRA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
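
A note on the reshuffled shift nodes above: the old byte-wise X86vshl/X86vshr are renamed to VSHLDQ/VSRLDQ, and the element shifts are split by operand kind, X86ISD::VSHL/VSRL/VSRA for the shift-count-in-an-XMM-register forms versus the new VSHLI/VSRLI/VSRAI for the imm8 forms. The two hardware encodings are visible from user-level intrinsics; a minimal sketch, assuming an SSE2 host (it prints the same values twice):

#include <emmintrin.h>
#include <stdio.h>

int main() {
  __m128i v = _mm_set_epi32(8, 4, 2, 1);
  // PSLLD xmm, xmm - the count lives in a vector register (X86ISD::VSHL).
  __m128i byReg = _mm_sll_epi32(v, _mm_cvtsi32_si128(3));
  // PSLLD xmm, imm8 - the count is an immediate (X86ISD::VSHLI).
  __m128i byImm = _mm_slli_epi32(v, 3);
  int a[4], b[4];
  _mm_storeu_si128((__m128i *)a, byReg);
  _mm_storeu_si128((__m128i *)b, byImm);
  printf("%d %d %d %d\n", a[0], a[1], a[2], a[3]); // 8 16 32 64
  printf("%d %d %d %d\n", b[0], b[1], b[2], b[3]); // 8 16 32 64
}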
@@ -92,6 +102,17 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86vpcom : SDNode<"X86ISD::VPCOM",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
+def X86vpcomu : SDNode<"X86ISD::VPCOMU",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>]>>;
+
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
// translated into one of the target nodes below during lowering.
// Note: this is a work in progress...
@@ -112,8 +133,7 @@ def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
-def X86Shufpd : SDNode<"X86ISD::SHUFPD", SDTShuff3OpI>;
-def X86Shufps : SDNode<"X86ISD::SHUFPS", SDTShuff3OpI>;
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
@@ -125,7 +145,6 @@ def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
-def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
@@ -171,15 +190,15 @@ def sdmem : Operand<v2f64> {
//===----------------------------------------------------------------------===//
// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
-def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
// Like 'store', but always requires 128-bit vector alignment.
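
The deleted loadv4i32/loadv8i32 fragments follow directly from the NOTEs: the backend legalizes every 128-bit integer vector load to v2i64 (v4i64 at 256 bits) and bitcasts in the patterns, which it can do because x86 has a single integer vector load regardless of lane width. An independent illustration of that last point, assuming SSE2 and a little-endian host:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  alignas(16) uint8_t bytes[16] = {1, 2, 3, 4};
  // One MOVDQA serves v16i8, v8i16, v4i32 and v2i64 alike; the element
  // width is purely a matter of interpretation after the load.
  __m128i v = _mm_load_si128((const __m128i *)bytes);
  uint32_t lo = (uint32_t)_mm_cvtsi128_si32(v);
  printf("%08x\n", lo); // 04030201
  return 0;
}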
@@ -210,22 +229,20 @@ def alignedloadfsf64 : PatFrag<(ops node:$ptr),
(f64 (alignedload node:$ptr))>;
// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
def alignedloadv4f32 : PatFrag<(ops node:$ptr),
(v4f32 (alignedload node:$ptr))>;
def alignedloadv2f64 : PatFrag<(ops node:$ptr),
(v2f64 (alignedload node:$ptr))>;
-def alignedloadv4i32 : PatFrag<(ops node:$ptr),
- (v4i32 (alignedload node:$ptr))>;
def alignedloadv2i64 : PatFrag<(ops node:$ptr),
(v2i64 (alignedload node:$ptr))>;
// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
def alignedloadv8f32 : PatFrag<(ops node:$ptr),
(v8f32 (alignedload256 node:$ptr))>;
def alignedloadv4f64 : PatFrag<(ops node:$ptr),
(v4f64 (alignedload256 node:$ptr))>;
-def alignedloadv8i32 : PatFrag<(ops node:$ptr),
- (v8i32 (alignedload256 node:$ptr))>;
def alignedloadv4i64 : PatFrag<(ops node:$ptr),
(v4i64 (alignedload256 node:$ptr))>;
@@ -244,20 +261,16 @@ def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
-def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
-def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
// 256-bit memop pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
-def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>;
-def memopv16i16 : PatFrag<(ops node:$ptr), (v16i16 (memop node:$ptr))>;
-def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>;
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
@@ -329,24 +342,6 @@ def BYTE_imm : SDNodeXForm<imm, [{
return getI32Imm(N->getZExtValue() >> 3);
}]>;
-// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
-// SHUFP* etc. imm.
-def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
- return getI8Imm(X86::getShuffleSHUFImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
-// PSHUFHW imm.
-def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
- return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
-// PSHUFLW imm.
-def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
- return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
-}]>;
-
// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
// to VEXTRACTF128 imm.
def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
@@ -359,72 +354,6 @@ def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
}]>;
-def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
-}]>;
-
-def movddup : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movlhps : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movlp : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movl : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N), Subtarget->hasAVX2());
-}]>;
-
-def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N), Subtarget->hasAVX2());
-}]>;
-
-def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_shuf_imm>;
-
-def shufp : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_shuf_imm>;
-
-def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_pshufhw_imm>;
-
-def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_pshuflw_imm>;
-
def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
node:$index), [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7d1b9a1..5a479f0 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1,4 +1,4 @@
-//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -274,7 +274,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
{ X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
{ X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
- { X86::WINCALL64r, X86::WINCALL64m, TB_FOLDED_LOAD },
{ X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
{ X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
{ X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
@@ -351,6 +350,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -361,6 +361,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
// AVX 256-bit foldable instructions
+ { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -513,6 +514,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPABSBrr128, X86::VPABSBrm128, TB_ALIGN_16 },
{ X86::VPABSDrr128, X86::VPABSDrm128, TB_ALIGN_16 },
{ X86::VPABSWrr128, X86::VPABSWrm128, TB_ALIGN_16 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, TB_ALIGN_16 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, TB_ALIGN_16 },
{ X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 },
@@ -529,16 +532,26 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// AVX 256-bit foldable instructions
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_16 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 },
// AVX2 foldable instructions
- { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_16 },
- { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_16 },
- { X86::VPABSWrr256, X86::VPABSWrm256, TB_ALIGN_16 },
- { X86::VPSHUFDYri, X86::VPSHUFDYmi, TB_ALIGN_16 },
- { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, TB_ALIGN_16 },
- { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, TB_ALIGN_16 }
+ { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 },
+ { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 },
+ { X86::VPABSWrr256, X86::VPABSWrm256, TB_ALIGN_32 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, TB_ALIGN_32 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, TB_ALIGN_32 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, TB_ALIGN_32 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, TB_ALIGN_32 },
+ { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, TB_ALIGN_32 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, TB_ALIGN_32 },
+ { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, TB_ALIGN_32 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, TB_ALIGN_32 },
+ { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 },
+ { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -575,6 +588,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
{ X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
{ X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
+ { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
+ { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
+ { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
{ X86::CMOVA16rr, X86::CMOVA16rm, 0 },
{ X86::CMOVA32rr, X86::CMOVA32rm, 0 },
{ X86::CMOVA64rr, X86::CMOVA64rm, 0 },
@@ -692,6 +709,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
@@ -700,12 +718,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
{ X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
{ X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
- { X86::PHADDDrr128, X86::PHADDDrm128, TB_ALIGN_16 },
- { X86::PHADDWrr128, X86::PHADDWrm128, TB_ALIGN_16 },
+ { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
+ { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
{ X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
- { X86::PHSUBDrr128, X86::PHSUBDrm128, TB_ALIGN_16 },
+ { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
- { X86::PHSUBWrr128, X86::PHSUBWrm128, TB_ALIGN_16 },
+ { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
{ X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 },
{ X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
@@ -722,10 +740,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
{ X86::PORrr, X86::PORrm, TB_ALIGN_16 },
{ X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
- { X86::PSHUFBrr128, X86::PSHUFBrm128, TB_ALIGN_16 },
- { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
- { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
- { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
+ { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
+ { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 },
+ { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 },
+ { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 },
{ X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
{ X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
@@ -809,6 +827,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 },
{ X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 },
{ X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, TB_ALIGN_16 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, TB_ALIGN_16 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, TB_ALIGN_16 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, TB_ALIGN_16 },
{ X86::VCMPPDrri, X86::VCMPPDrmi, TB_ALIGN_16 },
{ X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
@@ -871,6 +893,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 },
{ X86::VPAVGBrr, X86::VPAVGBrm, TB_ALIGN_16 },
{ X86::VPAVGWrr, X86::VPAVGWrm, TB_ALIGN_16 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, TB_ALIGN_16 },
{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 },
{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 },
{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, TB_ALIGN_16 },
@@ -879,12 +902,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 },
{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, TB_ALIGN_16 },
{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 },
- { X86::VPHADDDrr128, X86::VPHADDDrm128, TB_ALIGN_16 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, TB_ALIGN_16 },
{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, TB_ALIGN_16 },
- { X86::VPHADDWrr128, X86::VPHADDWrm128, TB_ALIGN_16 },
- { X86::VPHSUBDrr128, X86::VPHSUBDrm128, TB_ALIGN_16 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, TB_ALIGN_16 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, TB_ALIGN_16 },
{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, TB_ALIGN_16 },
- { X86::VPHSUBWrr128, X86::VPHSUBWrm128, TB_ALIGN_16 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, TB_ALIGN_16 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, TB_ALIGN_16 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, TB_ALIGN_16 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 },
{ X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, TB_ALIGN_16 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 },
@@ -901,10 +926,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 },
{ X86::VPORrr, X86::VPORrm, TB_ALIGN_16 },
{ X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 },
- { X86::VPSHUFBrr128, X86::VPSHUFBrm128, TB_ALIGN_16 },
- { X86::VPSIGNBrr128, X86::VPSIGNBrm128, TB_ALIGN_16 },
- { X86::VPSIGNWrr128, X86::VPSIGNWrm128, TB_ALIGN_16 },
- { X86::VPSIGNDrr128, X86::VPSIGNDrm128, TB_ALIGN_16 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, TB_ALIGN_16 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, TB_ALIGN_16 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, TB_ALIGN_16 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, TB_ALIGN_16 },
{ X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 },
{ X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 },
{ X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 },
@@ -939,90 +964,146 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 },
{ X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 },
{ X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 },
+ // AVX 256-bit foldable instructions
+ { X86::VADDPDYrr, X86::VADDPDYrm, TB_ALIGN_32 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, TB_ALIGN_32 },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, TB_ALIGN_32 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, TB_ALIGN_32 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, TB_ALIGN_32 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, TB_ALIGN_32 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, TB_ALIGN_32 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, TB_ALIGN_32 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, TB_ALIGN_32 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, TB_ALIGN_32 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, TB_ALIGN_32 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, TB_ALIGN_32 },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, TB_ALIGN_32 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, TB_ALIGN_32 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, TB_ALIGN_32 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, TB_ALIGN_32 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, TB_ALIGN_32 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, TB_ALIGN_32 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, TB_ALIGN_32 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, TB_ALIGN_32 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, TB_ALIGN_32 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, TB_ALIGN_32 },
+ { X86::VMAXPDYrr_Int, X86::VMAXPDYrm_Int, TB_ALIGN_32 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, TB_ALIGN_32 },
+ { X86::VMAXPSYrr_Int, X86::VMAXPSYrm_Int, TB_ALIGN_32 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, TB_ALIGN_32 },
+ { X86::VMINPDYrr_Int, X86::VMINPDYrm_Int, TB_ALIGN_32 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, TB_ALIGN_32 },
+ { X86::VMINPSYrr_Int, X86::VMINPSYrm_Int, TB_ALIGN_32 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, TB_ALIGN_32 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, TB_ALIGN_32 },
+ { X86::VORPDYrr, X86::VORPDYrm, TB_ALIGN_32 },
+ { X86::VORPSYrr, X86::VORPSYrm, TB_ALIGN_32 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, TB_ALIGN_32 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, TB_ALIGN_32 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, TB_ALIGN_32 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, TB_ALIGN_32 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, TB_ALIGN_32 },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, TB_ALIGN_32 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, TB_ALIGN_32 },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, TB_ALIGN_32 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, TB_ALIGN_32 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, TB_ALIGN_32 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, TB_ALIGN_32 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, TB_ALIGN_32 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, TB_ALIGN_32 },
// AVX2 foldable instructions
- { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, TB_ALIGN_16 },
- { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, TB_ALIGN_16 },
- { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, TB_ALIGN_16 },
- { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, TB_ALIGN_16 },
- { X86::VPADDBYrr, X86::VPADDBYrm, TB_ALIGN_16 },
- { X86::VPADDDYrr, X86::VPADDDYrm, TB_ALIGN_16 },
- { X86::VPADDQYrr, X86::VPADDQYrm, TB_ALIGN_16 },
- { X86::VPADDSBYrr, X86::VPADDSBYrm, TB_ALIGN_16 },
- { X86::VPADDSWYrr, X86::VPADDSWYrm, TB_ALIGN_16 },
- { X86::VPADDUSBYrr, X86::VPADDUSBYrm, TB_ALIGN_16 },
- { X86::VPADDUSWYrr, X86::VPADDUSWYrm, TB_ALIGN_16 },
- { X86::VPADDWYrr, X86::VPADDWYrm, TB_ALIGN_16 },
- { X86::VPALIGNR256rr, X86::VPALIGNR256rm, TB_ALIGN_16 },
- { X86::VPANDNYrr, X86::VPANDNYrm, TB_ALIGN_16 },
- { X86::VPANDYrr, X86::VPANDYrm, TB_ALIGN_16 },
- { X86::VPAVGBYrr, X86::VPAVGBYrm, TB_ALIGN_16 },
- { X86::VPAVGWYrr, X86::VPAVGWYrm, TB_ALIGN_16 },
- { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, TB_ALIGN_16 },
- { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, TB_ALIGN_16 },
- { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, TB_ALIGN_16 },
- { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, TB_ALIGN_16 },
- { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, TB_ALIGN_16 },
- { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, TB_ALIGN_16 },
- { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, TB_ALIGN_16 },
- { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_16 },
- { X86::VPHADDDrr256, X86::VPHADDDrm256, TB_ALIGN_16 },
- { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_16 },
- { X86::VPHADDWrr256, X86::VPHADDWrm256, TB_ALIGN_16 },
- { X86::VPHSUBDrr256, X86::VPHSUBDrm256, TB_ALIGN_16 },
- { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, TB_ALIGN_16 },
- { X86::VPHSUBWrr256, X86::VPHSUBWrm256, TB_ALIGN_16 },
- { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, TB_ALIGN_16 },
- { X86::VPMADDWDYrr, X86::VPMADDWDYrm, TB_ALIGN_16 },
- { X86::VPMAXSWYrr, X86::VPMAXSWYrm, TB_ALIGN_16 },
- { X86::VPMAXUBYrr, X86::VPMAXUBYrm, TB_ALIGN_16 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, TB_ALIGN_16 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, TB_ALIGN_16 },
- { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, TB_ALIGN_16 },
- { X86::VPMULDQYrr, X86::VPMULDQYrm, TB_ALIGN_16 },
- { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, TB_ALIGN_16 },
- { X86::VPMULHUWYrr, X86::VPMULHUWYrm, TB_ALIGN_16 },
- { X86::VPMULHWYrr, X86::VPMULHWYrm, TB_ALIGN_16 },
- { X86::VPMULLDYrr, X86::VPMULLDYrm, TB_ALIGN_16 },
- { X86::VPMULLWYrr, X86::VPMULLWYrm, TB_ALIGN_16 },
- { X86::VPMULUDQYrr, X86::VPMULUDQYrm, TB_ALIGN_16 },
- { X86::VPORYrr, X86::VPORYrm, TB_ALIGN_16 },
- { X86::VPSADBWYrr, X86::VPSADBWYrm, TB_ALIGN_16 },
- { X86::VPSHUFBrr256, X86::VPSHUFBrm256, TB_ALIGN_16 },
- { X86::VPSIGNBrr256, X86::VPSIGNBrm256, TB_ALIGN_16 },
- { X86::VPSIGNWrr256, X86::VPSIGNWrm256, TB_ALIGN_16 },
- { X86::VPSIGNDrr256, X86::VPSIGNDrm256, TB_ALIGN_16 },
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, TB_ALIGN_16 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, TB_ALIGN_32 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, TB_ALIGN_32 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, TB_ALIGN_32 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, TB_ALIGN_32 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, TB_ALIGN_32 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, TB_ALIGN_32 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, TB_ALIGN_32 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, TB_ALIGN_32 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, TB_ALIGN_32 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, TB_ALIGN_32 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, TB_ALIGN_32 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, TB_ALIGN_32 },
+ { X86::VPALIGNR256rr, X86::VPALIGNR256rm, TB_ALIGN_32 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, TB_ALIGN_32 },
+ { X86::VPANDYrr, X86::VPANDYrm, TB_ALIGN_32 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, TB_ALIGN_32 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, TB_ALIGN_32 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, TB_ALIGN_32 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, TB_ALIGN_32 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, TB_ALIGN_32 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, TB_ALIGN_32 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, TB_ALIGN_32 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, TB_ALIGN_32 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, TB_ALIGN_32 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, TB_ALIGN_32 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, TB_ALIGN_32 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, TB_ALIGN_32 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 },
+ { X86::VPERMPDYrr, X86::VPERMPDYrm, TB_ALIGN_32 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 },
+ { X86::VPERMQYrr, X86::VPERMQYrm, TB_ALIGN_32 },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 },
+ { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, TB_ALIGN_32 },
+ { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, TB_ALIGN_32 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, TB_ALIGN_32 },
+ { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, TB_ALIGN_32 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, TB_ALIGN_32 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, TB_ALIGN_32 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, TB_ALIGN_32 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, TB_ALIGN_32 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, TB_ALIGN_32 },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, TB_ALIGN_32 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, TB_ALIGN_32 },
+ { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, TB_ALIGN_32 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, TB_ALIGN_32 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, TB_ALIGN_32 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, TB_ALIGN_32 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, TB_ALIGN_32 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, TB_ALIGN_32 },
+ { X86::VPORYrr, X86::VPORYrm, TB_ALIGN_32 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, TB_ALIGN_32 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, TB_ALIGN_32 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, TB_ALIGN_32 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, TB_ALIGN_32 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, TB_ALIGN_32 },
{ X86::VPSLLDYrr, X86::VPSLLDYrm, TB_ALIGN_16 },
{ X86::VPSLLQYrr, X86::VPSLLQYrm, TB_ALIGN_16 },
{ X86::VPSLLWYrr, X86::VPSLLWYrm, TB_ALIGN_16 },
{ X86::VPSLLVDrr, X86::VPSLLVDrm, TB_ALIGN_16 },
- { X86::VPSLLVDYrr, X86::VPSLLVDYrm, TB_ALIGN_16 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, TB_ALIGN_32 },
{ X86::VPSLLVQrr, X86::VPSLLVQrm, TB_ALIGN_16 },
- { X86::VPSLLVQYrr, X86::VPSLLVQYrm, TB_ALIGN_16 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, TB_ALIGN_32 },
{ X86::VPSRADYrr, X86::VPSRADYrm, TB_ALIGN_16 },
{ X86::VPSRAWYrr, X86::VPSRAWYrm, TB_ALIGN_16 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, TB_ALIGN_16 },
- { X86::VPSRAVDYrr, X86::VPSRAVDYrm, TB_ALIGN_16 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, TB_ALIGN_32 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, TB_ALIGN_16 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, TB_ALIGN_16 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, TB_ALIGN_16 },
{ X86::VPSRLVDrr, X86::VPSRLVDrm, TB_ALIGN_16 },
- { X86::VPSRLVDYrr, X86::VPSRLVDYrm, TB_ALIGN_16 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, TB_ALIGN_32 },
{ X86::VPSRLVQrr, X86::VPSRLVQrm, TB_ALIGN_16 },
- { X86::VPSRLVQYrr, X86::VPSRLVQYrm, TB_ALIGN_16 },
- { X86::VPSUBBYrr, X86::VPSUBBYrm, TB_ALIGN_16 },
- { X86::VPSUBDYrr, X86::VPSUBDYrm, TB_ALIGN_16 },
- { X86::VPSUBSBYrr, X86::VPSUBSBYrm, TB_ALIGN_16 },
- { X86::VPSUBSWYrr, X86::VPSUBSWYrm, TB_ALIGN_16 },
- { X86::VPSUBWYrr, X86::VPSUBWYrm, TB_ALIGN_16 },
- { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, TB_ALIGN_16 },
- { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, TB_ALIGN_16 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, TB_ALIGN_32 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, TB_ALIGN_32 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, TB_ALIGN_32 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, TB_ALIGN_32 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, TB_ALIGN_32 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, TB_ALIGN_32 },
{ X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, TB_ALIGN_16 },
- { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, TB_ALIGN_16 },
- { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, TB_ALIGN_16 },
- { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, TB_ALIGN_16 },
- { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, TB_ALIGN_16 },
- { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_16 },
- { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_16 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, TB_ALIGN_32 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 },
+ { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 },
// FIXME: add AVX 256-bit foldable instructions
};
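
For context on what these OpTbl initializers feed: each row pairs a register-form opcode with its memory-folded twin plus flags, and the constructor loads the rows into lookup maps consulted when folding a load or store into an instruction; the TB_ALIGN_16/TB_ALIGN_32 flags record the alignment the memory form requires, which is why the 256-bit AVX/AVX2 rows above move from 16 to 32. A simplified sketch of the lookup side, not the actual LLVM data structures:

#include <unordered_map>

struct FoldEntry {
  unsigned MemOpc;   // memory-form opcode to rewrite to
  unsigned MinAlign; // required memory-operand alignment, in bytes
};
using FoldTable = std::unordered_map<unsigned, FoldEntry>;

// Folding succeeds only if a memory form exists for this opcode and the
// slot (stack or constant pool) is aligned enough for that form.
inline bool canFoldMemoryOperand(const FoldTable &Tbl, unsigned RegOpc,
                                 unsigned Align, unsigned &MemOpcOut) {
  auto It = Tbl.find(RegOpc);
  if (It == Tbl.end() || Align < It->second.MinAlign)
    return false;
  MemOpcOut = It->second.MemOpc;
  return true;
}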
@@ -1082,7 +1163,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
switch (MI.getOpcode()) {
default:
llvm_unreachable(0);
- break;
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
@@ -1125,7 +1205,8 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
static bool isFrameLoadOpcode(int Opcode) {
switch (Opcode) {
- default: break;
+ default:
+ return false;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV32rm:
@@ -1147,9 +1228,7 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
return true;
- break;
}
- return false;
}
static bool isFrameStoreOpcode(int Opcode) {
@@ -1339,6 +1418,8 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
bool SeenDef = false;
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SeenDef = true;
if (!MO.isReg())
continue;
if (MO.getReg() == X86::EFLAGS) {
@@ -1383,6 +1464,10 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
bool SawKill = false;
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
MachineOperand &MO = Iter->getOperand(j);
+ // A register mask may clobber EFLAGS, but we should still look for a
+ // live EFLAGS def.
+ if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+ SawKill = true;
if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
if (MO.isDef()) return MO.isDead();
if (MO.isKill()) SawKill = true;
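
The two new isRegMask checks cover call instructions, whose register effects are carried by a mask operand instead of explicit defs. In LLVM's convention a set bit in the mask means the register is preserved across the call, so "clobbers" is the complement of the bit test; a standalone sketch of that logic (the register number here is made up):

#include <cstdint>
#include <iostream>

// Mirrors the semantics of MachineOperand::clobbersPhysReg: bit clear
// means the call does not preserve Reg, i.e. Reg is clobbered.
static bool clobbersPhysReg(const uint32_t *Mask, unsigned Reg) {
  return !(Mask[Reg / 32] & (1u << (Reg % 32)));
}

int main() {
  const unsigned EFLAGS = 25;  // hypothetical register number
  uint32_t Mask[2] = {0, 0};   // empty mask: preserves nothing
  std::cout << clobbersPhysReg(Mask, EFLAGS) << "\n"; // 1 (clobbered)
  Mask[EFLAGS / 32] |= 1u << (EFLAGS % 32);           // mark preserved
  std::cout << clobbersPhysReg(Mask, EFLAGS) << "\n"; // 0 (safe)
}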
@@ -1493,7 +1578,6 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
switch (MIOpc) {
default:
llvm_unreachable(0);
- break;
case X86::SHL16ri: {
unsigned ShAmt = MI->getOperand(2).getImm();
MIB.addReg(0).addImm(1 << ShAmt)
@@ -1605,6 +1689,24 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.addReg(B, getKillRegState(isKill)).addImm(M);
break;
}
+ case X86::SHUFPDrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned M = MI->getOperand(3).getImm();
+
+ // Convert to PSHUFD mask.
+ M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6) | 0x44;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addReg(A, RegState::Define | getDeadRegState(isDead))
+ .addReg(B, getKillRegState(isKill)).addImm(M);
+ break;
+ }
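
The immediate rewrite in the new SHUFPDrri case deserves a word: SHUFPD's imm8 spends one selector bit per 64-bit lane, PSHUFD two bits per 32-bit lane, so each qword selector k expands to the dword pair {2k, 2k+1}; the constant 0x44 contributes the fixed "+1" dwords (bits 2 and 6). A standalone check of the arithmetic (not LLVM code):

#include <cassert>
#include <cstdint>

static uint8_t shufpdToPshufd(uint8_t M) {
  // (M & 1) selects dest qword 0's source, (M & 2) dest qword 1's; each
  // lands in the even dword slot of its pair, and 0x44 fills in the odd
  // (2k+1) dword of both pairs.
  return ((M & 1) << 1) | ((M & 1) << 3) |
         ((M & 2) << 4) | ((M & 2) << 6) | 0x44;
}

int main() {
  // SHUFPD $1 with equal sources swaps the qwords: as a PSHUFD mask
  // that is dwords [2,3,0,1] = 0b01001110 = 0x4E.
  assert(shufpdToPshufd(1) == 0x4E);
  // SHUFPD $0 broadcasts qword 0: dwords [0,1,0,1] = 0x44.
  assert(shufpdToPshufd(0) == 0x44);
  // SHUFPD $3 broadcasts qword 1: dwords [2,3,2,3] = 0xEE.
  assert(shufpdToPshufd(3) == 0xEE);
  return 0;
}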
case X86::SHL64ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
// NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
@@ -1733,7 +1835,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32rr_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc;
- TargetRegisterClass *RC;
+ const TargetRegisterClass *RC;
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
Opc = X86::LEA64r;
RC = X86::GR64_NOSPRegisterClass;
@@ -2908,6 +3010,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
case X86::AVX_SET0PSY:
case X86::AVX_SET0PDY:
case X86::AVX2_SETALLONES:
+ case X86::AVX2_SET0:
Alignment = 32;
break;
case X86::V_SET0:
@@ -2952,6 +3055,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
case X86::AVX_SET0PDY:
case X86::AVX_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX2_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -2985,6 +3089,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY)
Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0)
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index ee488d8..d065d2d 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -1,4 +1,4 @@
-//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0bc3afa..f585b47 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1,4 +1,4 @@
-//===- X86InstrInfo.td - Main X86 Instruction Definition ---*- tablegen -*-===//
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -99,17 +99,16 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
-def SDT_X86MEMBARRIERNoSSE : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
-def X86MemBarrierNoSSE : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIERNoSSE,
- [SDNPHasChain]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
@@ -241,6 +240,9 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL,
+ [SDNPHasChain, SDNPOutGlue]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -251,10 +253,31 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
// *mem - Operand definitions for the funky X86 addressing mode operands.
//
-def X86MemAsmOperand : AsmOperandClass {
- let Name = "Mem";
- let SuperClasses = [];
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem"; let PredicateMethod = "isMem";
+}
+def X86Mem8AsmOperand : AsmOperandClass {
+ let Name = "Mem8"; let PredicateMethod = "isMem8";
+}
+def X86Mem16AsmOperand : AsmOperandClass {
+ let Name = "Mem16"; let PredicateMethod = "isMem16";
+}
+def X86Mem32AsmOperand : AsmOperandClass {
+ let Name = "Mem32"; let PredicateMethod = "isMem32";
+}
+def X86Mem64AsmOperand : AsmOperandClass {
+ let Name = "Mem64"; let PredicateMethod = "isMem64";
+}
+def X86Mem80AsmOperand : AsmOperandClass {
+ let Name = "Mem80"; let PredicateMethod = "isMem80";
}
+def X86Mem128AsmOperand : AsmOperandClass {
+ let Name = "Mem128"; let PredicateMethod = "isMem128";
+}
+def X86Mem256AsmOperand : AsmOperandClass {
+ let Name = "Mem256"; let PredicateMethod = "isMem256";
+}
+
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
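The new PredicateMethod names (isMem8 through isMem256) imply per-width memory-operand checks in the assembly parser. A hypothetical C++ sketch of their shape follows; the real X86Operand class lives in X86AsmParser.cpp and is considerably richer, and the field names here are invented:

    // Sketch only: a sized memory class still accepts an unsized spelling,
    // since "mov (%rax), %rbx" carries no explicit width in AT&T syntax.
    struct X86OperandSketch {
      bool IsMem;        // true for memory operands
      unsigned MemSize;  // 0 if no size was spelled, else 8/16/.../256

      bool isMem() const { return IsMem; }
      bool isMemSize(unsigned N) const {
        return isMem() && (MemSize == 0 || MemSize == N);
      }
      bool isMem8() const   { return isMemSize(8); }
      bool isMem64() const  { return isMemSize(64); }
      bool isMem256() const { return isMemSize(256); }
      // isMem16/isMem32/isMem80/isMem128 follow the same pattern.
    };

Splitting the single Mem class into per-width classes lets Intel-syntax "ptr" sizes reject mismatched instruction forms instead of silently picking one.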
@@ -271,17 +294,28 @@ def opaque48mem : X86MemOperand<"printopaquemem">;
def opaque80mem : X86MemOperand<"printopaquemem">;
def opaque512mem : X86MemOperand<"printopaquemem">;
-def i8mem : X86MemOperand<"printi8mem">;
-def i16mem : X86MemOperand<"printi16mem">;
-def i32mem : X86MemOperand<"printi32mem">;
-def i64mem : X86MemOperand<"printi64mem">;
-def i128mem : X86MemOperand<"printi128mem">;
-def i256mem : X86MemOperand<"printi256mem">;
-def f32mem : X86MemOperand<"printf32mem">;
-def f64mem : X86MemOperand<"printf64mem">;
-def f80mem : X86MemOperand<"printf80mem">;
-def f128mem : X86MemOperand<"printf128mem">;
-def f256mem : X86MemOperand<"printf256mem">;
+def i8mem : X86MemOperand<"printi8mem"> {
+ let ParserMatchClass = X86Mem8AsmOperand; }
+def i16mem : X86MemOperand<"printi16mem"> {
+ let ParserMatchClass = X86Mem16AsmOperand; }
+def i32mem : X86MemOperand<"printi32mem"> {
+ let ParserMatchClass = X86Mem32AsmOperand; }
+def i64mem : X86MemOperand<"printi64mem"> {
+ let ParserMatchClass = X86Mem64AsmOperand; }
+def i128mem : X86MemOperand<"printi128mem"> {
+ let ParserMatchClass = X86Mem128AsmOperand; }
+def i256mem : X86MemOperand<"printi256mem"> {
+ let ParserMatchClass = X86Mem256AsmOperand; }
+def f32mem : X86MemOperand<"printf32mem"> {
+ let ParserMatchClass = X86Mem32AsmOperand; }
+def f64mem : X86MemOperand<"printf64mem"> {
+ let ParserMatchClass = X86Mem64AsmOperand; }
+def f80mem : X86MemOperand<"printf80mem"> {
+ let ParserMatchClass = X86Mem80AsmOperand; }
+def f128mem : X86MemOperand<"printf128mem"> {
+ let ParserMatchClass = X86Mem128AsmOperand; }
+def f256mem : X86MemOperand<"printf256mem"> {
+ let ParserMatchClass = X86Mem256AsmOperand; }
}
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -289,7 +323,7 @@ def f256mem : X86MemOperand<"printf256mem">;
def i8mem_NOREX : Operand<i64> {
let PrintMethod = "printi8mem";
let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
- let ParserMatchClass = X86MemAsmOperand;
+ let ParserMatchClass = X86Mem8AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -303,7 +337,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>;
def i32mem_TC : Operand<i32> {
let PrintMethod = "printi32mem";
let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm);
- let ParserMatchClass = X86MemAsmOperand;
+ let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -314,7 +348,7 @@ def i64mem_TC : Operand<i64> {
let PrintMethod = "printi64mem";
let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
ptr_rc_tailcall, i32imm, i8imm);
- let ParserMatchClass = X86MemAsmOperand;
+ let ParserMatchClass = X86Mem64AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -470,11 +504,8 @@ def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
-
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
-def HasXMM : Predicate<"Subtarget->hasXMM()">;
-def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
@@ -489,15 +520,14 @@ def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
-def FPStackf32 : Predicate<"!Subtarget->hasXMM()">;
-def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
AssemblerPredicate<"Mode64Bit">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
-def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
@@ -1513,6 +1543,7 @@ include "X86InstrMMX.td"
include "X86Instr3DNow.td"
include "X86InstrVMX.td"
+include "X86InstrSVM.td"
// System instructions.
include "X86InstrSystem.td"
@@ -1601,6 +1632,8 @@ def : MnemonicAlias<"fcmovna", "fcmovbe">;
def : MnemonicAlias<"fcmovae", "fcmovnb">;
def : MnemonicAlias<"fcomip", "fcompi">;
def : MnemonicAlias<"fildq", "fildll">;
+def : MnemonicAlias<"fistpq", "fistpll">;
+def : MnemonicAlias<"fisttpq", "fisttpll">;
def : MnemonicAlias<"fldcww", "fldcw">;
def : MnemonicAlias<"fnstcww", "fnstcw">;
def : MnemonicAlias<"fnstsww", "fnstsw">;
@@ -1822,20 +1855,20 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
// errors, since its encoding is the most compact.
def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
-// shld/shrd op,op -> shld op, op, 1
-def : InstAlias<"shldw $r1, $r2", (SHLD16rri8 GR16:$r1, GR16:$r2, 1)>;
-def : InstAlias<"shldl $r1, $r2", (SHLD32rri8 GR32:$r1, GR32:$r2, 1)>;
-def : InstAlias<"shldq $r1, $r2", (SHLD64rri8 GR64:$r1, GR64:$r2, 1)>;
-def : InstAlias<"shrdw $r1, $r2", (SHRD16rri8 GR16:$r1, GR16:$r2, 1)>;
-def : InstAlias<"shrdl $r1, $r2", (SHRD32rri8 GR32:$r1, GR32:$r2, 1)>;
-def : InstAlias<"shrdq $r1, $r2", (SHRD64rri8 GR64:$r1, GR64:$r2, 1)>;
-
-def : InstAlias<"shldw $mem, $reg", (SHLD16mri8 i16mem:$mem, GR16:$reg, 1)>;
-def : InstAlias<"shldl $mem, $reg", (SHLD32mri8 i32mem:$mem, GR32:$reg, 1)>;
-def : InstAlias<"shldq $mem, $reg", (SHLD64mri8 i64mem:$mem, GR64:$reg, 1)>;
-def : InstAlias<"shrdw $mem, $reg", (SHRD16mri8 i16mem:$mem, GR16:$reg, 1)>;
-def : InstAlias<"shrdl $mem, $reg", (SHRD32mri8 i32mem:$mem, GR32:$reg, 1)>;
-def : InstAlias<"shrdq $mem, $reg", (SHRD64mri8 i64mem:$mem, GR64:$reg, 1)>;
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shldw $r1, $r2", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shldl $r1, $r2", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shldq $r1, $r2", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
+def : InstAlias<"shrdw $r1, $r2", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shrdl $r1, $r2", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shrdq $r1, $r2", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
+
+def : InstAlias<"shldw $mem, $reg", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shldl $mem, $reg", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shldq $mem, $reg", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shrdw $mem, $reg", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shrdl $mem, $reg", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shrdq $mem, $reg", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
/* FIXME: This is disabled because the asm matcher is currently incapable of
* matching a fixed immediate like $1.
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index b2d9fca..63f96b6 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -1,4 +1,4 @@
-//====- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -105,19 +105,23 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d> {
def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))], d>;
+ [(set DstRC:$dst, (Int SrcRC:$src))],
+ IIC_DEFAULT, d>;
def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>;
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))],
+ IIC_DEFAULT, d>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, Domain d> {
def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
- asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>;
+ asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ IIC_DEFAULT, d>;
def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>;
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ IIC_DEFAULT, d>;
}
//===----------------------------------------------------------------------===//
@@ -175,25 +179,25 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (x86mmx VR64:$src), addr:$dst)]>;
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
- "movdq2q\t{$src, $dst|$dst, $src}",
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
(i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))))))]>;
-def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
- "movq2dq\t{$src, $dst|$dst, $src}",
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector
(i64 (bitconvert (x86mmx VR64:$src))))))]>;
let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
- "movq2dq\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>;
-def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src),
- "movdq2q\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>;
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 345f606..c6d1d19 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1,4 +1,4 @@
-//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,6 +13,126 @@
//
//===----------------------------------------------------------------------===//
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+ OpndItins s = arg_s;
+ OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+ InstrItinClass arg_ri> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+def SSE_ALU_F32S : OpndItins<
+ IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+ IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+
+def SSE_ALU_ITINS_S : SizeItins<
+ SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+ IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+
+def SSE_MUL_ITINS_S : SizeItins<
+ SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+ IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+
+def SSE_DIV_ITINS_S : SizeItins<
+ SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+def SSE_ALU_F32P : OpndItins<
+ IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+ IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+
+def SSE_ALU_ITINS_P : SizeItins<
+ SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+ IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+
+def SSE_MUL_ITINS_P : SizeItins<
+ SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+ IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+
+def SSE_DIV_ITINS_P : SizeItins<
+ SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+def SSE_INTALU_ITINS_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+ IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+
+def SSE_INTMUL_ITINS_P : OpndItins<
+ IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+ IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+ IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+ IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
@@ -21,25 +141,27 @@
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
+ OpndItins itins,
bit Is2Addr = 1> {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>;
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>;
}
def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>;
+ [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>;
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
string asm, string SSEVer, string FPSizeStr,
Operand memopr, ComplexPattern mem_cpat,
+ OpndItins itins,
bit Is2Addr = 1> {
def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -47,33 +169,34 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))]>;
+ RC:$src1, RC:$src2))], itins.rr>;
def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, mem_cpat:$src2))]>;
+ RC:$src1, mem_cpat:$src2))], itins.rm>;
}
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
- Domain d, bit Is2Addr = 1> {
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>;
let mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>;
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>;
}
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
@@ -87,33 +210,33 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rr, d>;
+ pat_rr, IIC_DEFAULT, d>;
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rm, d>;
+ pat_rm, IIC_DEFAULT, d>;
}
/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
string asm, string SSEVer, string FPSizeStr,
X86MemOperand x86memop, PatFrag mem_frag,
- Domain d, bit Is2Addr = 1> {
+ Domain d, OpndItins itins, bit Is2Addr = 1> {
def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))], d>;
+ RC:$src1, RC:$src2))], IIC_DEFAULT, d>;
def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, (mem_frag addr:$src2)))], d>;
+ RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>;
}
//===----------------------------------------------------------------------===//
@@ -171,7 +294,7 @@ def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
-let Predicates = [HasXMMInt] in {
+let Predicates = [HasSSE2] in {
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
@@ -244,9 +367,9 @@ let Predicates = [HasAVX] in {
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1 in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>, Requires<[HasXMM]>;
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasXMMInt]>;
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
//===----------------------------------------------------------------------===//
@@ -279,16 +402,35 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
// JIT implementation, it does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1, Predicates = [HasAVX] in {
+ isCodeGenOnly = 1 in {
+let Predicates = [HasAVX] in {
def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
}
+let Predicates = [HasAVX2], neverHasSideEffects = 1 in
+def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
+ []>, VEX_4V;
+}
+let Predicates = [HasAVX2], AddedComplexity = 5 in {
+ def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
+ def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
+ def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
+ def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
+}
// AVX has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
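A note on the pseudos used above: V_SET0, AVX2_SET0 and friends carry no encoding of their own and are rewritten at MCInst lowering into a self-xor (or a pcmpeq for all-ones). A rough C++ sketch in the style of X86MCInstLower.cpp follows; the opcode choice for AVX2_SET0 is an assumption consistent with this patch:

    #include "llvm/MC/MCInst.h"
    using namespace llvm;

    // Sketch: turn a one-operand zero-idiom pseudo into "dst = xor dst, dst".
    static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
      OutMI.setOpcode(NewOpc);
      OutMI.addOperand(OutMI.getOperand(0)); // src1 = dst
      OutMI.addOperand(OutMI.getOperand(0)); // src2 = dst
    }
    // In the lowering switch, something like:
    //   case X86::AVX2_SET0:
    //     LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;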
@@ -304,11 +446,11 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
- def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX] in
def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
@@ -325,22 +467,25 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// in terms of a copy, and as just mentioned, we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
-class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> :
+class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
- [(set (vt VR128:$dst), (movl VR128:$src1, (scalar_to_vector RC:$src2)))]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR>;
// Loading from memory automatically zeroing upper bits.
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_pat, string OpcodeStr> :
SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))]>;
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM>;
// AVX
-def VMOVSSrr : sse12_move_rr<FR32, v4f32,
+def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
"movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
VEX_LIG;
-def VMOVSDrr : sse12_move_rr<FR64, v2f64,
+def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
"movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
VEX_LIG;
@@ -348,11 +493,13 @@ def VMOVSDrr : sse12_move_rr<FR64, v2f64,
let isCodeGenOnly = 1 in {
def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR32:$src2),
- "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_MOV_S_RR>,
XS, VEX_4V, VEX_LIG;
def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR64:$src2),
- "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_MOV_S_RR>,
XD, VEX_4V, VEX_LIG;
}
@@ -366,26 +513,30 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG;
+ [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ XS, VEX, VEX_LIG;
def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG;
+ [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ XD, VEX, VEX_LIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
- def MOVSSrr : sse12_move_rr<FR32, v4f32,
+ def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
"movss\t{$src2, $dst|$dst, $src2}">, XS;
- def MOVSDrr : sse12_move_rr<FR64, v2f64,
+ def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
"movsd\t{$src2, $dst|$dst, $src2}">, XD;
// For the disassembler
let isCodeGenOnly = 1 in {
def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR32:$src2),
- "movss\t{$src2, $dst|$dst, $src2}", []>, XS;
+ "movss\t{$src2, $dst|$dst, $src2}", [],
+ IIC_SSE_MOV_S_RR>, XS;
def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR64:$src2),
- "movsd\t{$src2, $dst|$dst, $src2}", []>, XD;
+ "movsd\t{$src2, $dst|$dst, $src2}", [],
+ IIC_SSE_MOV_S_RR>, XD;
}
}
@@ -398,157 +549,14 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)]>;
+ [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)]>;
+ [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
// Patterns
-let Predicates = [HasSSE1] in {
- let AddedComplexity = 15 in {
- // Extract the low 32-bit value from one vector and insert it into another.
- def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
- def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSS to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
- }
-
- let AddedComplexity = 20 in {
- // MOVSSrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
- }
-
- // Extract and store.
- def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSSmr addr:$dst,
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-
- // Shuffle with MOVSS
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (MOVSSrr VR128:$src1, FR32:$src2)>;
- def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-}
-
-let Predicates = [HasSSE2] in {
- let AddedComplexity = 15 in {
- // Extract the low 64-bit value from one vector and insert it into another.
- def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
- def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
- // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
- def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
- def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
-
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVSD to the lower bits.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
- }
-
- let AddedComplexity = 20 in {
- // MOVSDrm zeros the high parts of the register; represent this
- // with SUBREG_TO_REG.
- def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- def : Pat<(v2f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
- }
-
- // Extract and store.
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSDmr addr:$dst,
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
-
- // Shuffle with MOVSD
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (MOVSDrr VR128:$src1, FR64:$src2)>;
- def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
-
- // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
- // is during lowering, where it's not possible to recognize the fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
- def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>;
- def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
-}
-
let Predicates = [HasAVX] in {
let AddedComplexity = 15 in {
- // Extract the low 32-bit value from one vector and insert it into another.
- def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
- (VMOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
- def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
- (VMOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-
- // Extract the low 64-bit value from one vector and insert it into another.
- def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
- (VMOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
- def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
- (VMOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
- // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
- def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
- def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
-
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVS{S,D} to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
@@ -628,7 +636,12 @@ let Predicates = [HasAVX] in {
(VMOVSDrr (v2f64 (V_SET0)),
(EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>;
- // Extract and store.
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256:$src), sub_sd)), sub_xmm)>;
+
+// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst,
@@ -639,8 +652,6 @@ let Predicates = [HasAVX] in {
(EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
// Shuffle with VMOVSS
- def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (VMOVSSrr VR128:$src1, FR32:$src2)>;
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
(VMOVSSrr (v4i32 VR128:$src1),
(EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
@@ -649,18 +660,16 @@ let Predicates = [HasAVX] in {
(EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
// 256-bit variants
- def : Pat<(v8i32 (X86Movsd VR256:$src1, VR256:$src2)),
+ def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss),
(EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>;
- def : Pat<(v8f32 (X86Movsd VR256:$src1, VR256:$src2)),
+ def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss),
(EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>;
// Shuffle with VMOVSD
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (VMOVSDrr VR128:$src1, FR64:$src2)>;
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr (v2i64 VR128:$src1),
(EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
@@ -703,6 +712,101 @@ let Predicates = [HasAVX] in {
sub_sd))>;
}
+let Predicates = [HasSSE1] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst,
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+ // Shuffle with MOVSS
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+}
+
+let Predicates = [HasSSE2] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSD to the lower bits.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst,
+ (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+ // Shuffle with MOVSD
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+
+  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the problem
+  // is during lowering, where it's not possible to recognize the fold because
+ // it has two uses through a bitcast. One use disappears at isel time and the
+ // fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>;
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//
@@ -710,93 +814,122 @@ let Predicates = [HasAVX] in {
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
+ OpndItins itins,
bit IsReMaterializable = 1> {
let neverHasSideEffects = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>;
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (ld_frag addr:$src))], d>;
+ [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>;
}
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle>, TB, VEX;
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble>, TB, OpSize, VEX;
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle>, TB, VEX;
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
- "movaps", SSEPackedSingle>, TB, VEX;
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB, VEX;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
- "movapd", SSEPackedDouble>, TB, OpSize, VEX;
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize, VEX;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
- "movups", SSEPackedSingle>, TB, VEX;
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB, VEX;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize, VEX;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle>, TB;
+ "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+ TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble>, TB, OpSize;
+ "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+ TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle>, TB;
+ "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+ TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, 0>, TB, OpSize;
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ TB, OpSize;
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, VEX;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)]>, VEX;
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)]>, VEX;
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX;
+ [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX;
+ [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v8f32 VR256:$src), addr:$dst)]>, VEX;
+ [(store (v8f32 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v4f64 VR256:$src), addr:$dst)]>, VEX;
+ [(store (v4f64 VR256:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>, VEX;
// For disassembler
let isCodeGenOnly = 1 in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
- "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>, VEX;
}
let Predicates = [HasAVX] in {
@@ -815,37 +948,42 @@ def : Pat<(v4f64 (X86vzmovl
}
-def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
-
-def : Pat<(int_x86_avx_loadu_pd_256 addr:$src), (VMOVUPDYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
(VMOVUPDYmr addr:$dst, VR256:$src)>;
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
- [(store (v4f32 VR128:$src), addr:$dst)]>;
+ [(store (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
- [(store (v2f64 VR128:$src), addr:$dst)]>;
+ [(store (v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVU_P_MR>;
// For disassembler
let isCodeGenOnly = 1 in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>;
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>;
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movups\t{$src, $dst|$dst, $src}", []>;
+ "movups\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}", []>;
+ "movupd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>;
}
let Predicates = [HasAVX] in {
@@ -862,44 +1000,9 @@ let Predicates = [HasSSE2] in
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
(MOVUPDmr addr:$dst, VR128:$src)>;
-// Use movaps / movups for SSE integer load / store (one byte shorter).
-// The instructions selected below are then converted to MOVDQA/MOVDQU
-// during the SSE domain pass.
-let Predicates = [HasSSE1] in {
- def : Pat<(alignedloadv4i32 addr:$src),
- (MOVAPSrm addr:$src)>;
- def : Pat<(loadv4i32 addr:$src),
- (MOVUPSrm addr:$src)>;
- def : Pat<(alignedloadv2i64 addr:$src),
- (MOVAPSrm addr:$src)>;
- def : Pat<(loadv2i64 addr:$src),
- (MOVUPSrm addr:$src)>;
-
- def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
-}
-
// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX] in {
// 128-bit load/store
- def : Pat<(alignedloadv4i32 addr:$src),
- (VMOVAPSrm addr:$src)>;
- def : Pat<(loadv4i32 addr:$src),
- (VMOVUPSrm addr:$src)>;
def : Pat<(alignedloadv2i64 addr:$src),
(VMOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
@@ -927,10 +1030,6 @@ let Predicates = [HasAVX] in {
(VMOVAPSYrm addr:$src)>;
def : Pat<(loadv4i64 addr:$src),
(VMOVUPSYrm addr:$src)>;
- def : Pat<(alignedloadv8i32 addr:$src),
- (VMOVAPSYrm addr:$src)>;
- def : Pat<(loadv8i32 addr:$src),
- (VMOVUPSYrm addr:$src)>;
def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
@@ -949,36 +1048,71 @@ let Predicates = [HasAVX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
}
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [HasSSE1] in {
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
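The "SSE domain pass" this comment refers to works off equivalence tables that map one logical operation to its single-, double-, and integer-domain encodings. A trimmed C++ sketch of the relevant rows follows (the table shape mirrors X86InstrInfo.cpp; only the moves this comment concerns are shown):

    // One row per logical op: { PackedSingle, PackedDouble, PackedInt }.
    // The execution-domain fixup picks the column that avoids domain
    // crossings, which is how the MOVAPS/MOVUPS selected above become
    // MOVDQA/MOVDQU.
    static const unsigned ReplaceableMoveRows[][3] = {
      { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
      { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
      { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
      { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
    };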
// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let neverHasSideEffects = 1 in {
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>;
def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>, VEX;
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
}
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
let isCodeGenOnly = 1 in {
def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX;
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX;
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX;
}
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
}
//===----------------------------------------------------------------------===//
@@ -986,94 +1120,68 @@ let isCodeGenOnly = 1 in {
//===----------------------------------------------------------------------===//
multiclass sse12_mov_hilo_packed<bits<8> opc, RegisterClass RC,
- PatFrag mov_frag, string base_opc,
- string asm_opr> {
+ SDNode psnode, SDNode pdnode, string base_opc,
+ string asm_opr, InstrItinClass itin> {
def PSrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "s", asm_opr),
[(set RC:$dst,
- (mov_frag RC:$src1,
+ (psnode RC:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
- SSEPackedSingle>, TB;
+ itin, SSEPackedSingle>, TB;
def PDrm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
- [(set RC:$dst, (v2f64 (mov_frag RC:$src1,
+ [(set RC:$dst, (v2f64 (pdnode RC:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
- SSEPackedDouble>, TB, OpSize;
+ itin, SSEPackedDouble>, TB, OpSize;
}
let AddedComplexity = 20 in {
- defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
+ defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ IIC_SSE_MOV_LH>, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
- defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
- "\t{$src2, $dst|$dst, $src2}">;
+ defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
+ "\t{$src2, $dst|$dst, $src2}",
+ IIC_SSE_MOV_LH>;
}
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>, VEX;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)]>, VEX;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)]>;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>;
let Predicates = [HasAVX] in {
- let AddedComplexity = 20 in {
- // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
- def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
- // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
- def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
- }
-
- // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
- def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (VMOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
- VR128:$src2)), addr:$src1),
- (VMOVLPSmr addr:$src1, VR128:$src2)>;
-
- // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
- def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (VMOVLPDmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (VMOVLPDmr addr:$src1, VR128:$src2)>;
-
// Shuffle with VMOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
// Shuffle with VMOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movlpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
// Store patterns
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
@@ -1091,23 +1199,10 @@ let Predicates = [HasAVX] in {
}
let Predicates = [HasSSE1] in {
- let AddedComplexity = 20 in {
- // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
- def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
- }
-
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
- VR128:$src2)), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
// Shuffle with MOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
@@ -1115,9 +1210,6 @@ let Predicates = [HasSSE1] in {
def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(X86Movlps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
@@ -1132,28 +1224,11 @@ let Predicates = [HasSSE1] in {
}
let Predicates = [HasSSE2] in {
- let AddedComplexity = 20 in {
- // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
- def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
- }
-
- // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
- def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
-
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Movlpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
// Store patterns
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
@@ -1169,12 +1244,14 @@ let Predicates = [HasSSE2] in {
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
- defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
+ defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ IIC_SSE_MOV_LH>, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
- defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
- "\t{$src2, $dst|$dst, $src2}">;
+ defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
+ "\t{$src2, $dst|$dst, $src2}",
+ IIC_SSE_MOV_LH>;
}
// v2f64 extract element 1 is always custom lowered to unpack high to low
@@ -1182,33 +1259,28 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
- (unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (undef)), (iPTR 0))), addr:$dst)]>,
- VEX;
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
- (v2f64 (unpckh VR128:$src, (undef))),
- (iPTR 0))), addr:$dst)]>,
- VEX;
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
- (unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (undef)), (iPTR 0))), addr:$dst)]>;
+ (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (bc_v2f64 (v4f32 VR128:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
- (v2f64 (unpckh VR128:$src, (undef))),
- (iPTR 0))), addr:$dst)]>;
+ (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
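+// NB: X86Unpckh is a target node taking two concrete inputs, so the source
+// register is simply repeated where the old shuffle fragment used (undef).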
let Predicates = [HasAVX] in {
// VMOVHPS patterns
- def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (VMOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(VMOVHPSrm VR128:$src1, addr:$src2)>;
@@ -1217,65 +1289,32 @@ let Predicates = [HasAVX] in {
(VMOVHPSrm VR128:$src1, addr:$src2)>;
  // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the problem
- // is during lowering, where it's not possible to recognize the load fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
+  // is during lowering, where it's not possible to recognize the load fold
+  // because it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
-
- // FIXME: This should be matched by a X86Movhpd instead. Same as above
- def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (VMOVHPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst),
- (VMOVHPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst),
- (VMOVHPDmr addr:$dst, VR128:$src)>;
}
let Predicates = [HasSSE1] in {
// MOVHPS patterns
- def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
- def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst),
- (MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [HasSSE2] in {
  // FIXME: Instead of X86Unpckl, there should be an X86Movlhpd here; the problem
- // is during lowering, where it's not possible to recognize the load fold cause
- // it has two uses through a bitcast. One use disappears at isel time and the
- // fold opportunity reappears.
+  // is during lowering, where it's not possible to recognize the load fold
+  // because it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
-
- // FIXME: This should be matched by a X86Movhpd instead. Same as above
- def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
-
- // Store patterns
- def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckh VR128:$src, (undef))), (iPTR 0))),addr:$dst),
- (MOVHPDmr addr:$dst, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -1287,13 +1326,15 @@ let AddedComplexity = 20 in {
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>,
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
VEX_4V;
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>,
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>,
VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
@@ -1301,86 +1342,36 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>;
+ (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>;
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
+ (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+ IIC_SSE_MOV_LH>;
}
let Predicates = [HasAVX] in {
// MOVLHPS patterns
- let AddedComplexity = 20 in {
- def : Pat<(v4f32 (movddup VR128:$src, (undef))),
- (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
- def : Pat<(v2i64 (movddup VR128:$src, (undef))),
- (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
-
- // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
- def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
- (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
- }
- def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
// MOVHLPS patterns
- let AddedComplexity = 20 in {
- // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
- def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
- (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
-
- // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
- def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
- (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
- def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
- (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
- }
-
- def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
let Predicates = [HasSSE1] in {
// MOVLHPS patterns
- let AddedComplexity = 20 in {
- def : Pat<(v4f32 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
- def : Pat<(v2i64 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
-
- // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
- def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
- }
- def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
// MOVHLPS patterns
- let AddedComplexity = 20 in {
- // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
- def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
- // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
- def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
- def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
- }
-
- def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
(MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
@@ -1389,70 +1380,97 @@ let Predicates = [HasSSE1] in {
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//
+def SSE_CVT_PD : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+def SSE_CVT_PS : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+def SSE_CVT_Scalar : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+def SSE_CVT_SS2SI_32 : OpndItins<
+ IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+def SSE_CVT_SS2SI_64 : OpndItins<
+ IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+def SSE_CVT_SD2SI : OpndItins<
+ IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
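+
+// A minimal sketch of OpndItins (an assumption; its definition is not part
+// of this hunk, but it presumably reads roughly like this earlier in the
+// file):
+//   class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+//     InstrItinClass rr = arg_rr;
+//     InstrItinClass rm = arg_rm;
+//   }
+// Each record above just pairs the register-form and memory-form
+// itineraries, which the multiclasses below consume as itins.rr/itins.rm.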
+
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm> {
+ string asm, OpndItins itins> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))]>;
+ [(set DstRC:$dst, (OpNode SrcRC:$src))],
+ itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>;
-}
-
-multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>;
- let mayLoad = 1 in
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>;
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+ itins.rm>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, Domain d> {
+ string asm, Domain d, OpndItins itins> {
def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (OpNode SrcRC:$src))], d>;
+ [(set DstRC:$dst, (OpNode SrcRC:$src))],
+ itins.rr, d>;
def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], d>;
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+ itins.rm, d>;
}
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
+} // neverHasSideEffects = 1
}
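// NB: the AVX forms above carry no pattern, so without neverHasSideEffects
// TableGen would conservatively infer unmodeled side effects and block CSE
// and dead-code elimination of these conversions.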
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
- VEX_LIG;
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>,
+ XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
- VEX_W, VEX_LIG;
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX,
- VEX_LIG;
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}">, XD,
- VEX, VEX_W, VEX_LIG;
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W, VEX_LIG;
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
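// For example (illustrative assembly, not taken from this patch): with a
// register operand the width is implied,
//   vcvtsi2ss %rax, %xmm1, %xmm0      # 64-bit source inferred from %rax
// but a memory operand is ambiguous, hence the explicit suffix:
//   vcvtsi2ssq (%rdi), %xmm1, %xmm0   # "q" form required for an i64 load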
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS,
- VEX_4V, VEX_LIG;
-defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS,
- VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD,
- VEX_4V, VEX_LIG;
-defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD,
- VEX_4V, VEX_LIG;
-defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD,
- VEX_4V, VEX_W, VEX_LIG;
-
-let Predicates = [HasAVX] in {
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">,
+ XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+ XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+ XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+ XD, VEX_4V, VEX_W, VEX_LIG;
+
+let Predicates = [HasAVX], AddedComplexity = 1 in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1473,169 +1491,185 @@ let Predicates = [HasAVX] in {
}
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}">, XS;
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}">, XD;
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si{q}\t{$src, $dst|$dst, $src}">, XD, REX_W;
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
- "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS;
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
- "cvtsi2ss{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
- "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD;
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
- "cvtsi2sd{q}\t{$src, $dst|$dst, $src}">, XD, REX_W;
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_Scalar>, XD, REX_W;
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
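// NB: these are the forms reached from C through the *mmintrin.h wrappers;
// e.g. _mm_cvtsd_si32() lowers to int_x86_sse2_cvtsd2si and selects the
// CVTSD2SI definitions below.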
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
- string asm> {
+ string asm, OpndItins itins> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))]>;
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>;
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
- PatFrag ld_frag, string asm, bit Is2Addr = 1> {
+ PatFrag ld_frag, string asm, OpndItins itins,
+ bit Is2Addr = 1> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>;
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ itins.rm>;
}
-defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- f128mem, load, "cvtsd2si">, XD, VEX;
-defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
- int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">,
- XD, VEX, VEX_W;
-
-// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_
-// Get rid of this hack or rename the intrinsics, there are several
-// intructions that only match with the intrinsic form, why create duplicates
-// to let them be recognized by the assembler?
-defm VCVTSD2SI : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
- "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG;
-defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
- "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W,
- VEX_LIG;
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+ f128mem, load, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+ int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si",
+ SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- f128mem, load, "cvtsd2si{l}">, XD;
+ f128mem, load, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
- f128mem, load, "cvtsd2si{q}">, XD, REX_W;
+ f128mem, load, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V;
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V,
VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V;
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd",
+ SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd",
+ SSE_CVT_Scalar, 0>, XD,
VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in {
defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss">, XS;
+ "cvtsi2ss", SSE_CVT_Scalar>, XS;
defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64,
- "cvtsi2ss{q}">, XS, REX_W;
+ "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd">, XD;
+ "cvtsi2sd", SSE_CVT_Scalar>, XD;
defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd">, XD, REX_W;
+ "cvtsi2sd", SSE_CVT_Scalar>, XD, REX_W;
}
/// SSE 1 Only
// Aliases for intrinsics
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
- f32mem, load, "cvttss2si">, XS, VEX;
+ f32mem, load, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, f32mem, load,
- "cvttss2si">, XS, VEX, VEX_W;
+ "cvttss2si", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
- f128mem, load, "cvttsd2si">, XD, VEX;
+ f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, f128mem, load,
- "cvttsd2si">, XD, VEX, VEX_W;
+ "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
- f32mem, load, "cvttss2si">, XS;
+ f32mem, load, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, f32mem, load,
- "cvttss2si{q}">, XS, REX_W;
+ "cvttss2si{q}", SSE_CVT_SS2SI_64>,
+ XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
- f128mem, load, "cvttsd2si">, XD;
+ f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
+ XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, f128mem, load,
- "cvttsd2si{q}">, XD, REX_W;
+ "cvttsd2si{q}", SSE_CVT_SD2SI>,
+ XD, REX_W;
let Pattern = []<dag> in {
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
- "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS,
- VEX, VEX_LIG;
+ "cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
- "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
- VEX_W, VEX_LIG;
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB, VEX;
+ SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB, VEX;
+ SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
}
let Pattern = []<dag> in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
- "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
+ "cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
- "cvtss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB; /* PD SSE3 form is avaiable */
+ SSEPackedSingle, SSE_CVT_PS>,
+                       TB; /* PD SSE3 form is available */
}
-let Predicates = [HasSSE1] in {
+let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_cvtss2si VR128:$src),
- (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
- (CVTSS2SIrm addr:$src)>;
+ (VCVTSS2SIrm addr:$src)>;
def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
- (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
- (CVTSS2SI64rm addr:$src)>;
+ (VCVTSS2SI64rm addr:$src)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasSSE1] in {
def : Pat<(int_x86_sse_cvtss2si VR128:$src),
- (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
- (VCVTSS2SIrm addr:$src)>;
+ (CVTSS2SIrm addr:$src)>;
def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
- (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
- (VCVTSS2SI64rm addr:$src)>;
+ (CVTSS2SI64rm addr:$src)>;
}
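// NB: AVX-capable subtargets also satisfy HasSSE1 here, so the HasAVX block
// was presumably moved first so that the VEX-encoded forms win the otherwise
// tied pattern match.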
/// SSE 2 Only
@@ -1643,43 +1677,51 @@ let Predicates = [HasAVX] in {
// Convert scalar double to scalar single
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
- "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_LIG;
+ "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR64:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
Requires<[HasAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround FR64:$src))]>;
+ [(set FR32:$dst, (fround FR64:$src))],
+ IIC_SSE_CVT_Scalar_RR>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+ IIC_SSE_CVT_Scalar_RM>,
+ XD,
Requires<[HasSSE2, OptForSize]>;
defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
- int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>,
+ int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
+ SSE_CVT_Scalar, 0>,
XS, VEX_4V;
let Constraints = "$src1 = $dst" in
defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
- int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS;
+ int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
+ SSE_CVT_Scalar>, XS;
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
+ [], IIC_SSE_CVT_Scalar_RR>,
+ XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR32:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
+ [], IIC_SSE_CVT_Scalar_RM>,
+ XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
let Predicates = [HasAVX] in {
def : Pat<(f64 (fextend FR32:$src)),
@@ -1696,11 +1738,13 @@ def : Pat<(extloadf32 addr:$src),
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+ [(set FR64:$dst, (fextend FR32:$src))],
+ IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[HasSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+ [(set FR64:$dst, (extloadf32 addr:$src))],
+ IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[HasSSE2, OptForSize]>;
// extload f32 -> f64. This matches load+fextend because we have a hack in
@@ -1717,26 +1761,30 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- VR128:$src2))]>, XS, VEX_4V,
+ VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V,
Requires<[HasAVX]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- (load addr:$src2)))]>, XS, VEX_4V,
+ (load addr:$src2)))],
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V,
Requires<[HasAVX]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- VR128:$src2))]>, XS,
+ VR128:$src2))],
+ IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[HasSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
- (load addr:$src2)))]>, XS,
+ (load addr:$src2)))],
+ IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[HasSSE2]>;
}
@@ -1744,216 +1792,275 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
// SSE2 instructions without OpSize prefix
def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
+ IIC_SSE_CVT_PS_RR>,
TB, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))]>,
+ (bitconvert (memopv2i64 addr:$src))))],
+ IIC_SSE_CVT_PS_RM>,
TB, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
+ IIC_SSE_CVT_PS_RR>,
TB, Requires<[HasSSE2]>;
def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"cvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))]>,
+ (bitconvert (memopv2i64 addr:$src))))],
+ IIC_SSE_CVT_PS_RM>,
TB, Requires<[HasSSE2]>;
// FIXME: why is the non-intrinsic version described as SSE3?
// SSE2 instructions with XS prefix
def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
XS, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))]>,
+ (bitconvert (memopv2i64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>,
XS, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
XS, Requires<[HasSSE2]>;
def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))]>,
+ (bitconvert (memopv2i64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>,
XS, Requires<[HasSSE2]>;
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RM>, VEX;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RM>, VEX;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RR>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+ "cvtps2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PS_RM>;
def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>,
VEX;
def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
(ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))]>, VEX;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX;
def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>;
def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))]>;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PS_RM>;
// SSE2 packed instructions with XD prefix
def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
XD, VEX, Requires<[HasAVX]>;
def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))]>,
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
XD, VEX, Requires<[HasAVX]>;
def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
XD, Requires<[HasSSE2]>;
def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))]>,
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
XD, Requires<[HasSSE2]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-let mayLoad = 1 in
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX;
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX;
def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-let mayLoad = 1 in
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ IIC_SSE_CVT_PS_RR>, VEX;
def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+ (memopv8f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, VEX;
+
def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))]>;
+ (int_x86_sse2_cvttps2dq VR128:$src))],
+ IIC_SSE_CVT_PS_RR>;
def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memop addr:$src)))]>;
-
-def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))]>,
- XS, VEX, Requires<[HasAVX]>;
-def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (memop addr:$src)))]>,
- XS, VEX, Requires<[HasAVX]>;
-
-let Predicates = [HasSSE2] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_CVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>;
-}
+ (int_x86_sse2_cvttps2dq (memop addr:$src)))],
+ IIC_SSE_CVT_PS_RM>;
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(Int_VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (Int_VCVTDQ2PSrm addr:$src)>;
+
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (VCVTTPS2DQrm addr:$src)>;
+
def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
(VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
(VCVTTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+ (VCVTTPS2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (Int_CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ (CVTTPS2DQrm addr:$src)>;
}
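// NB: the memory-operand patterns added above let fp_to_sint/sint_to_fp of
// a loaded vector select the rm forms directly, folding the load instead of
// first materializing the vector in a register.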
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX;
+ (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>, VEX;
let isCodeGenOnly = 1 in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))]>, VEX;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX;
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ IIC_SSE_CVT_PD_RR>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))]>;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttpd2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+ "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, TB, VEX;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, TB, VEX;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, TB, VEX;
}
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+ "cvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+ "cvtps2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, TB;
def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
TB, VEX, Requires<[HasAVX]>;
def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))]>,
+ (load addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
TB, VEX, Requires<[HasAVX]>;
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+ IIC_SSE_CVT_PD_RR>,
TB, Requires<[HasSSE2]>;
def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))]>,
+ (load addr:$src)))],
+ IIC_SSE_CVT_PD_RM>,
TB, Requires<[HasSSE2]>;
// Convert packed double to packed single
@@ -1961,42 +2068,54 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtpd2psx\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtpd2psx\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX;
+ "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+ "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>;
def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>;
def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
(ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))]>;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>;
def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ IIC_SSE_CVT_PD_RR>;
def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))]>;
+ (memop addr:$src)))],
+ IIC_SSE_CVT_PD_RM>;
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
@@ -2026,11 +2145,6 @@ def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
(VCVTTPD2DQYrm addr:$src)>;
-def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src),
- (VCVTTPS2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)),
- (VCVTTPS2DQYrm addr:$src)>;
-
// Match fround and fextend for 128/256-bit conversions
def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
(VCVTPD2PSYrr VR256:$src)>;
@@ -2049,69 +2163,84 @@ def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
SDNode OpNode, ValueType VT, PatFrag ld_frag,
- string asm, string asm_alt> {
+ string asm, string asm_alt,
+ OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>;
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+ itins.rr>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))]>;
+ (ld_frag addr:$src2), imm:$cc))],
+ itins.rm>;
// Accept explicit immediate argument form instead of comparison code.
let neverHasSideEffects = 1 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>;
+ (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RR>;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>;
+ (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
+ IIC_SSE_ALU_F32S_RM>;
}
}
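// NB: the _alt forms let the assembler accept an explicit immediate, e.g.
// "cmpss $1, %xmm1, %xmm0" as an alternative to "cmpltss %xmm1, %xmm0"
// (condition code 1 = lt).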
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSE_ALU_F32S>,
XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S>, // same latency as 32-bit compare
XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
XS;
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
- "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SSE_ALU_F32S>, // same latency as 32-bit compare
XD;
}
multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
- Intrinsic Int, string asm> {
+ Intrinsic Int, string asm, OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, SSECC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, imm:$cc))]>;
+ VR128:$src, imm:$cc))],
+ itins.rr>;
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f32mem:$src, SSECC:$cc), asm,
+ (ins VR128:$src1, x86memop:$src, SSECC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- (load addr:$src), imm:$cc))]>;
+ (load addr:$src), imm:$cc))],
+ itins.rm>;
}
// Aliases to match intrinsics which expect XMM operand(s).
defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">,
+ "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>,
XS, VEX_4V;
defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">,
+ "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS;
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S>, XS;
defm Int_CMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $dst|$dst, $src}">, XD;
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S>, // same latency as f32
+ XD;
}
@@ -2121,11 +2250,13 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
PatFrag ld_frag, string OpcodeStr, Domain d> {
def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], d>;
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR, d>;
def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- (ld_frag addr:$src2)))], d>;
+ (ld_frag addr:$src2)))],
+ IIC_SSE_COMIS_RM, d>;
}
let Defs = [EFLAGS] in {
@@ -2182,19 +2313,21 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
let isAsmParserOnly = 1 in {
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>;
+ [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+ IIC_SSE_CMPP_RR, d>;
def rmi : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>;
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+ IIC_SSE_CMPP_RM, d>;
}
// Accept explicit immediate argument form instead of comparison code.
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], d>;
+ asm_alt, [], IIC_SSE_CMPP_RR, d>;
def rmi_alt : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc),
- asm_alt, [], d>;
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_CMPP_RM, d>;
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
@@ -2224,40 +2357,40 @@ let Constraints = "$src1 = $dst" in {
SSEPackedDouble>, TB, OpSize;
}
-let Predicates = [HasSSE1] in {
-def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
- (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [HasSSE2] in {
-def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
- (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-}
-
let Predicates = [HasAVX] in {
-def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
(VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
(VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
(VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
(VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
+let Predicates = [HasSSE1] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
@@ -2267,14 +2400,14 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
Domain d, bit IsConvertibleToThreeAddress = 0> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm,
- [(set RC:$dst, (vt (shufp:$src3
- RC:$src1, (mem_frag addr:$src2))))], d>;
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3), asm,
- [(set RC:$dst,
- (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>;
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], IIC_SSE_SHUFP, d>;
}
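// For reference, the 8-bit SHUFP immediate selects elements in 2-bit fields:
// imm[1:0] and imm[3:2] pick from $src1, imm[5:4] and imm[7:6] from $src2.
// A hypothetical example, with a = $src1 and b = $src2:
//   shufps $0xB1, %xmm1, %xmm0   ; 0b10110001 -> dst = {a1,a0,b3,b2}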
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2297,133 +2430,52 @@ let Constraints = "$src1 = $dst" in {
TB;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSEPackedDouble>, TB, OpSize;
-}
-
-let Predicates = [HasSSE1] in {
- def : Pat<(v4f32 (X86Shufps VR128:$src1,
- (memopv4f32 addr:$src2), (i8 imm:$imm))),
- (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufps VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
- (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
- // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
- // fall back to this for SSE1)
- def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
- (SHUFPSrri VR128:$src2, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special unary SHUFPSrri case.
- def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPSrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
-}
-
-let Predicates = [HasSSE2] in {
- // Special binary v4i32 shuffle cases with SHUFPS.
- def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
- (SHUFPSrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (SHUFPSrmi VR128:$src1, addr:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special unary SHUFPDrri cases.
- def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special binary v2i64 shuffle cases using SHUFPDrri.
- def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
- (SHUFPDrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Generic SHUFPD patterns
- def : Pat<(v2f64 (X86Shufps VR128:$src1,
- (memopv2f64 addr:$src2), (i8 imm:$imm))),
- (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
- def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
+ TB, OpSize;
}
let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86Shufps VR128:$src1,
- (memopv4f32 addr:$src2), (i8 imm:$imm))),
- (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufps VR128:$src1,
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
(VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
- // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
- // fall back to this for SSE1)
- def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
- (VSHUFPSrri VR128:$src2, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special unary SHUFPSrri case.
- def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
- (VSHUFPSrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special binary v4i32 shuffle cases with SHUFPS.
- def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
- (VSHUFPSrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (VSHUFPSrmi VR128:$src1, addr:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special unary SHUFPDrri cases.
- def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
- (VSHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
- (VSHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
- // Special binary v2i64 shuffle cases using SHUFPDrri.
- def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
- (VSHUFPDrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
-
- def : Pat<(v2f64 (X86Shufps VR128:$src1,
- (memopv2f64 addr:$src2), (i8 imm:$imm))),
+
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
- def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
// 256-bit patterns
- def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v8i32 (X86Shufps VR256:$src1,
+ def : Pat<(v8i32 (X86Shufp VR256:$src1,
(bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
(VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v8f32 (X86Shufps VR256:$src1,
- (memopv8f32 addr:$src2), (i8 imm:$imm))),
- (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
-
- def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v4i64 (X86Shufpd VR256:$src1,
+ def : Pat<(v4i64 (X86Shufp VR256:$src1,
(memopv4i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
- def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v4f64 (X86Shufpd VR256:$src1,
- (memopv4f64 addr:$src2), (i8 imm:$imm))),
- (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+let Predicates = [HasSSE1] in {
+ def : Pat<(v4i32 (X86Shufp VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasSSE2] in {
+ // Generic SHUFPD patterns
+ def : Pat<(v2i64 (X86Shufp VR128:$src1,
+ (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -2431,143 +2483,80 @@ let Predicates = [HasAVX] in {
//===----------------------------------------------------------------------===//
/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
-multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
Domain d> {
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
- (vt (OpNode RC:$src1, RC:$src2)))], d>;
+ (vt (OpNode RC:$src1, RC:$src2)))],
+ IIC_SSE_UNPCK, d>;
def rm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1,
- (mem_frag addr:$src2))))], d>;
-}
-
-let AddedComplexity = 10 in {
- defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
- VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
- defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
- VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
- defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
- VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
- defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
- VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
-
- defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
- VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
- defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
- VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
- defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
- VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
- defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
- VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
-
- let Constraints = "$src1 = $dst" in {
- defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
- VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
- defm UNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
- VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, TB, OpSize;
- defm UNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
- VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
- defm UNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
- VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, TB, OpSize;
- } // Constraints = "$src1 = $dst"
-} // AddedComplexity
+ (mem_frag addr:$src2))))],
+ IIC_SSE_UNPCK, d>;
+}
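// For reference, X86Unpckl/X86Unpckh interleave the low or high halves of
// the two sources; with a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}:
//   unpcklps: dst = {a0,b0,a1,b1}
//   unpckhps: dst = {a2,b2,a3,b3}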
+
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+ VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+ VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+ VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+ VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
-let Predicates = [HasSSE1] in {
- def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))),
- (UNPCKLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)),
- (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))),
- (UNPCKHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)),
- (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
-}
-
-let Predicates = [HasSSE2] in {
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))),
- (UNPCKLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)),
- (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))),
- (UNPCKHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)),
- (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
+let Constraints = "$src1 = $dst" in {
+ defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+ VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, TB;
+ defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+ VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, TB, OpSize;
+ defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+ VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+ SSEPackedSingle>, TB;
+ defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+ VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ SSEPackedDouble>, TB, OpSize;
+} // Constraints = "$src1 = $dst"
+let Predicates = [HasAVX], AddedComplexity = 1 in {
// FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
// problem is during lowering, where it's not possible to recognize the load
// fold because it has two uses through a bitcast. One use disappears at isel
// time and the fold opportunity reappears.
def : Pat<(v2f64 (X86Movddup VR128:$src)),
- (UNPCKLPDrr VR128:$src, VR128:$src)>;
-
- let AddedComplexity = 10 in
- def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
- (UNPCKLPDrr VR128:$src, VR128:$src)>;
+ (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}
-let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86Unpckl VR128:$src1, (memopv4f32 addr:$src2))),
- (VUNPCKLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4f32 (X86Unpckl VR128:$src1, VR128:$src2)),
- (VUNPCKLPSrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4f32 (X86Unpckh VR128:$src1, (memopv4f32 addr:$src2))),
- (VUNPCKHPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4f32 (X86Unpckh VR128:$src1, VR128:$src2)),
- (VUNPCKHPSrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v8f32 (X86Unpckl VR256:$src1, (memopv8f32 addr:$src2))),
- (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8f32 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8f32 (X86Unpckh VR256:$src1, (memopv8f32 addr:$src2))),
- (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8f32 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
-
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, (memopv2f64 addr:$src2))),
- (VUNPCKLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, VR128:$src2)),
- (VUNPCKLPDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2f64 (X86Unpckh VR128:$src1, (memopv2f64 addr:$src2))),
- (VUNPCKHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckh VR128:$src1, VR128:$src2)),
- (VUNPCKHPDrr VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v4f64 (X86Unpckl VR256:$src1, (memopv4f64 addr:$src2))),
- (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4f64 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4f64 (X86Unpckh VR256:$src1, (memopv4f64 addr:$src2))),
- (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4f64 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
-
+let Predicates = [HasSSE2] in {
// FIXME: Instead of X86Movddup, there should be an X86Unpckl here; the
// problem is during lowering, where it's not possible to recognize the load
// fold because it has two uses through a bitcast. One use disappears at isel
// time and the fold opportunity reappears.
def : Pat<(v2f64 (X86Movddup VR128:$src)),
- (VUNPCKLPDrr VR128:$src, VR128:$src)>;
- let AddedComplexity = 10 in
- def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
- (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+ (UNPCKLPDrr VR128:$src, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -2579,29 +2568,12 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
Domain d> {
def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set GR32:$dst, (Int RC:$src))], d>;
+ [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>;
def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W;
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
+ IIC_SSE_MOVMSK, d>, REX_W;
}
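// For reference: movmskps/movmskpd copy the sign bit of each packed element
// into the low bits of a GPR and zero the remainder, e.g. (hypothetical):
//   movmskps %xmm0, %eax   ; eax[3:0] = sign bits of the four singles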
-defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
- SSEPackedSingle>, TB;
-defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
- SSEPackedDouble>, TB, OpSize;
-
-def : Pat<(i32 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>, Requires<[HasSSE1]>;
-def : Pat<(i64 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
- sub_ss))>, Requires<[HasSSE1]>;
-def : Pat<(i32 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>, Requires<[HasSSE2]>;
-def : Pat<(i64 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
- sub_sd))>, Requires<[HasSSE2]>;
-
let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
"movmskps", SSEPackedSingle>, TB, VEX;
@@ -2629,17 +2601,105 @@ let Predicates = [HasAVX] in {
// Assembler Only
def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+ "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedSingle>, TB, VEX;
def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+ "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedDouble>, TB,
OpSize, VEX;
def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+ "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedSingle>, TB, VEX;
def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+ "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+ SSEPackedDouble>, TB,
OpSize, VEX;
}
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+ SSEPackedSingle>, TB;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+ SSEPackedDouble>, TB, OpSize;
+
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>, Requires<[HasSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>, Requires<[HasSSE2]>;
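// In other words, a scalar sign test such as (i32 (X86fgetsign FR32:$x)) is
// selected as a MOVMSK of a vector whose low lane holds $x; only bit 0 of
// the GPR result is meaningful afterwards.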
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0,
+ bit Is2Addr = 1> {
+ let isCommutable = IsCommutable in
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))],
+ itins.rm>;
+}
+} // ExeDomain = SSEPackedInt
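// For reference, the !if(Is2Addr, ...) strings above pick between the tied
// SSE form and the three-operand AVX form, e.g. (hypothetical operands):
//   pand  %xmm1, %xmm0           ; SSE: $src1 is constrained to $dst
//   vpand %xmm2, %xmm1, %xmm0    ; AVX: non-destructive VEX encoding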
+
+// These are ordered here to satisfy pattern ordering requirements with the fp versions.
+
+let Predicates = [HasAVX] in {
+defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1>;
+defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1>;
+defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 1>;
+defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
+ i128mem, SSE_BIT_ITINS_P, 0>;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX2] in {
+defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
+ i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//
@@ -2647,31 +2707,39 @@ let Predicates = [HasAVX] in {
/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
///
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V;
+ FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+ TB, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V;
+ FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
+ TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
- f32, f128mem, memopfsf32, SSEPackedSingle>, TB;
+ f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
+ TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
- f64, f128mem, memopfsf64, SSEPackedDouble>, TB, OpSize;
+ f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
+ TB, OpSize;
}
}
// Alias bitwise logical operations using SSE logical ops on packed FP values.
let mayLoad = 0 in {
- defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>;
- defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>;
- defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>;
+ defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+ defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
}
let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
- defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>;
+ defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+ SSE_BIT_ITINS_P>;
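// For reference: there are no scalar SSE logic instructions, so the Fs*
// aliases above apply the packed ps/pd logic opcodes to FR32/FR64 values,
// e.g. an fabs lowering can mask off the sign bit with an FsAND against a
// constant-pool mask.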
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
@@ -2758,118 +2826,145 @@ let isCommutable = 0 in
/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
/// classes below.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins,
bit Is2Addr = 1> {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
- OpNode, FR32, f32mem, Is2Addr>, XS;
+ OpNode, FR32, f32mem,
+ itins.s, Is2Addr>, XS;
defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
- OpNode, FR64, f64mem, Is2Addr>, XD;
+ OpNode, FR64, f64mem,
+ itins.d, Is2Addr>, XD;
}
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins,
bit Is2Addr = 1> {
let mayLoad = 0 in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
- v4f32, f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB;
+ v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
+ TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
- v2f64, f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize;
+ v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
+ TB, OpSize;
}
}
multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode,
+ SizeItins itins> {
let mayLoad = 0 in {
defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
- v8f32, f256mem, memopv8f32, SSEPackedSingle, 0>, TB;
+ v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
+ TB;
defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
- v4f64, f256mem, memopv4f64, SSEPackedDouble, 0>, TB, OpSize;
+ v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
+ TB, OpSize;
}
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins,
bit Is2Addr = 1> {
defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS;
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ itins.s, Is2Addr>, XS;
defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, Is2Addr>, XD;
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ itins.d, Is2Addr>, XD;
}
multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins,
bit Is2Addr = 1> {
defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128,
!strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32,
- SSEPackedSingle, Is2Addr>, TB;
+ SSEPackedSingle, itins.s, Is2Addr>,
+ TB;
defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128,
!strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64,
- SSEPackedDouble, Is2Addr>, TB, OpSize;
+ SSEPackedDouble, itins.d, Is2Addr>,
+ TB, OpSize;
}
-multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> {
+multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins> {
defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
- SSEPackedSingle, 0>, TB;
+ SSEPackedSingle, itins.s, 0>, TB;
defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
- SSEPackedDouble, 0>, TB, OpSize;
+ SSEPackedDouble, itins.d, 0>, TB, OpSize;
}
// Binary Arithmetic instructions
-defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
- basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG;
-defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
- basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
-defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
- basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG;
-defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
- basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;
+defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+ VEX_4V;
+defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+ VEX_4V;
let isCommutable = 0 in {
- defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG;
- defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
- basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
- defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
- basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG;
- defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
- basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
- defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
- basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG;
- defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
- basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
- basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
- basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
- defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
- basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG;
- defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
- basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
- basic_sse12_fp_binop_p_y_int<0x5D, "min">,
- basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
+ defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, VEX_4V;
+ defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+ VEX_4V;
+ defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_p_y_int<0x5F, "max", SSE_ALU_ITINS_P>,
+ VEX_4V;
+ defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y_int<0x5D, "min", SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+ VEX_4V;
}
let Constraints = "$src1 = $dst" in {
- defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>,
- basic_sse12_fp_binop_p<0x58, "add", fadd>,
- basic_sse12_fp_binop_s_int<0x58, "add">;
- defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>,
- basic_sse12_fp_binop_p<0x59, "mul", fmul>,
- basic_sse12_fp_binop_s_int<0x59, "mul">;
+ defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+ defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+ basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>,
- basic_sse12_fp_binop_p<0x5C, "sub", fsub>,
- basic_sse12_fp_binop_s_int<0x5C, "sub">;
- defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>,
- basic_sse12_fp_binop_p<0x5E, "div", fdiv>,
- basic_sse12_fp_binop_s_int<0x5E, "div">;
- defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>,
- basic_sse12_fp_binop_p<0x5F, "max", X86fmax>,
- basic_sse12_fp_binop_s_int<0x5F, "max">,
- basic_sse12_fp_binop_p_int<0x5F, "max">;
- defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>,
- basic_sse12_fp_binop_p<0x5D, "min", X86fmin>,
- basic_sse12_fp_binop_s_int<0x5D, "min">,
- basic_sse12_fp_binop_p_int<0x5D, "min">;
+ defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P>;
+ defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P>;
}
}
@@ -2881,9 +2976,25 @@ let Constraints = "$src1 = $dst" in {
///
/// And, we have a special variant form for a full-vector intrinsic form.
+def SSE_SQRTP : OpndItins<
+ IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+>;
+
+def SSE_SQRTS : OpndItins<
+ IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+>;
+
+def SSE_RCPP : OpndItins<
+ IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+ IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+
/// sse1_fp_unop_s - SSE1 unops in scalar form.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
- SDNode OpNode, Intrinsic F32Int> {
+ SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode FR32:$src))]>;
@@ -2893,14 +3004,14 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
// partial register update condition.
def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
- [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
+ [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
Requires<[HasSSE1, OptForSize]>;
def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (F32Int VR128:$src))]>;
+ [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>;
}
/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
@@ -2919,64 +3030,72 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
-multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>;
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>;
def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>;
}
/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form.
-multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>;
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+ itins.rr>;
def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>;
+ [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+ itins.rm>;
}
/// sse1_fp_unop_p_int - SSE1 intrinsic unops in packed forms.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
- Intrinsic V4F32Int> {
+ Intrinsic V4F32Int, OpndItins itins> {
def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V4F32Int VR128:$src))]>;
+ [(set VR128:$dst, (V4F32Int VR128:$src))],
+ itins.rr>;
def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+ itins.rm>;
}
/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsic unops in packed forms.
multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
- Intrinsic V4F32Int> {
+ Intrinsic V4F32Int, OpndItins itins> {
def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V4F32Int VR256:$src))]>;
+ [(set VR256:$dst, (V4F32Int VR256:$src))],
+ itins.rr>;
def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))]>;
+ [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))],
+ itins.rm>;
}
/// sse2_fp_unop_s - SSE2 unops in scalar form.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
- SDNode OpNode, Intrinsic F64Int> {
+ SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
- [(set FR64:$dst, (OpNode FR64:$src))]>;
+ [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>;
// See the comments in sse1_fp_unop_s for why this is OptForSize.
def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
- [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD,
+ [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
Requires<[HasSSE2, OptForSize]>;
def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (F64Int VR128:$src))]>;
+ [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>;
}
/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
@@ -2998,45 +3117,52 @@ multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>;
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>;
def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>;
}
/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms.
-multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>;
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+ itins.rr>;
def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>;
+ [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+ itins.rm>;
}
/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr,
- Intrinsic V2F64Int> {
+ Intrinsic V2F64Int, OpndItins itins> {
def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V2F64Int VR128:$src))]>;
+ [(set VR128:$dst, (V2F64Int VR128:$src))],
+ itins.rr>;
def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+ [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))],
+ itins.rm>;
}
/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms.
multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
- Intrinsic V2F64Int> {
+ Intrinsic V2F64Int, OpndItins itins> {
def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V2F64Int VR256:$src))]>;
+ [(set VR256:$dst, (V2F64Int VR256:$src))],
+ itins.rr>;
def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>;
+ [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))],
+ itins.rm>;
}
let Predicates = [HasAVX] in {
@@ -3044,31 +3170,40 @@ let Predicates = [HasAVX] in {
defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">,
sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;
- defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>,
- sse2_fp_unop_p<0x51, "vsqrt", fsqrt>,
- sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
- sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
- sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps>,
- sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd>,
- sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256>,
- sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256>,
+ defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
+ sse2_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
+ sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps,
+ SSE_SQRTP>,
+ sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd,
+ SSE_SQRTP>,
+ sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256,
+ SSE_SQRTP>,
+ sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256,
+ SSE_SQRTP>,
VEX;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
- defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>,
- sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>,
- sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>,
- sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX;
+ defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256,
+ SSE_SQRTP>,
+ sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps,
+ SSE_SQRTP>, VEX;
defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
- defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>,
- sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>,
- sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>,
- sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX;
+ defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp, SSE_RCPP>,
+ sse1_fp_unop_p_y<0x53, "vrcp", X86frcp, SSE_RCPP>,
+ sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256,
+ SSE_RCPP>,
+ sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps,
+ SSE_RCPP>, VEX;
}
+let AddedComplexity = 1 in {
def : Pat<(f32 (fsqrt FR32:$src)),
(VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -3091,8 +3226,9 @@ def : Pat<(f32 (X86frcp FR32:$src)),
def : Pat<(f32 (X86frcp (load addr:$src))),
(VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX, OptForSize]>;
+}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX], AddedComplexity = 1 in {
def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
(VSQRTSSr (f32 (IMPLICIT_DEF)),
@@ -3127,21 +3263,26 @@ let Predicates = [HasAVX] in {
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt>,
- sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt>,
- sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>;
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
+ SSE_SQRTS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
+ sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps, SSE_SQRTS>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
+ SSE_SQRTS>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
+ sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>,
- sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp>,
- sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
+ SSE_SQRTS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
+ sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
+ SSE_SQRTS>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
+ SSE_RCPS>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
+ sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
// There is no f64 version of the reciprocal approximation instructions.
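// A worked sketch of the refinement mentioned above (standard Newton-Raphson,
// not code from this file): with x0 = rcpps(a), one step
//   x1 = x0 * (2 - a * x0)
// roughly doubles the ~12 bits of precision; for rsqrtps use
//   x1 = 0.5 * x0 * (3 - a * x0 * x0)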
@@ -3154,24 +3295,22 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
(ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src),
- addr:$dst)]>, VEX;
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src),
- addr:$dst)]>, VEX;
- def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2f64 VR128:$src),
- addr:$dst)]>, VEX;
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
let ExeDomain = SSEPackedInt in
def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f32 VR128:$src),
- addr:$dst)]>, VEX;
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
@@ -3180,23 +3319,21 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
- addr:$dst)]>, VEX;
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
- addr:$dst)]>, VEX;
- def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f256mem:$dst, VR256:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f64 VR256:$src),
- addr:$dst)]>, VEX;
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v8f32 VR256:$src),
- addr:$dst)]>, VEX;
+ [(alignednontemporalstore (v4i64 VR256:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX;
}
def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
@@ -3209,19 +3346,18 @@ def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
-
-def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+ [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVNT>;
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
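// Usage sketch (not from this file): non-temporal stores bypass the cache
// hierarchy and are weakly ordered, so producers normally fence afterwards:
//   movntps %xmm0, (%rdi)
//   sfence                   ; order the streaming stores before publication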
@@ -3229,11 +3365,13 @@ def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti{q}\t{$src, $dst|$dst, $src}",
- [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+ IIC_SSE_MOVNT>,
TB, Requires<[HasSSE2]>;
}
@@ -3242,31 +3380,40 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
//===----------------------------------------------------------------------===//
// Prefetch intrinsic.
-def PREFETCHT0 : VoPSI<0x18, MRM1m, (outs), (ins i8mem:$src),
- "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>;
-def PREFETCHT1 : VoPSI<0x18, MRM2m, (outs), (ins i8mem:$src),
- "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>;
-def PREFETCHT2 : VoPSI<0x18, MRM3m, (outs), (ins i8mem:$src),
- "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>;
-def PREFETCHNTA : VoPSI<0x18, MRM0m, (outs), (ins i8mem:$src),
- "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>;
+let Predicates = [HasSSE1] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+ IIC_SSE_PREFETCH>, TB;
+}
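// For reference, the (i32 3)..(i32 0) locality operands above map prefetcht0
// (keep in all cache levels) down to prefetchnta (non-temporal), e.g.:
//   prefetcht0  (%rdi)    ; hypothetical address
//   prefetchnta (%rdi)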
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
- TB, Requires<[HasSSE2]>;
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+ IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
-def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;
// Load, store, and memory fence
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
- "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>;
+ "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+ TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
- "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+ "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+ TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
- "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+ "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+ TB, Requires<[HasSSE2]>;
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
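// For reference: sfence orders prior stores, lfence orders prior loads, and
// mfence orders both, making it usable as a full StoreLoad barrier.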
@@ -3277,14 +3424,18 @@ def : Pat<(X86MFence), (MFENCE)>;
//===----------------------------------------------------------------------===//
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX;
def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>;
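// ldmxcsr and stmxcsr load and store the 32-bit MXCSR control/status register
// (rounding mode, exception masks and flags), hence the i32mem operand.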
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3294,108 +3445,134 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
let neverHasSideEffects = 1 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+ VEX;
}
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+ VEX;
// For Disassembler
let isCodeGenOnly = 1 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>,
+ VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>,
+ VEX;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>,
+ VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqu\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVU_P_RR>,
+ VEX;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+ VEX;
let Predicates = [HasAVX] in {
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX;
}
}
let mayStore = 1 in {
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+ VEX;
let Predicates = [HasAVX] in {
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+ XS, VEX;
}
}
let neverHasSideEffects = 1 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>;
+ "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- []>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
// For Disassembler
let isCodeGenOnly = 1 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", []>;
+ "movdqa\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVA_P_RR>;
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- []>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+ IIC_SSE_MOVA_P_RM>;
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+ IIC_SSE_MOVU_P_RM>,
XS, Requires<[HasSSE2]>;
}
let mayStore = 1 in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
- [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVA_P_MR>;
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+ IIC_SSE_MOVU_P_MR>,
XS, Requires<[HasSSE2]>;
}
// Intrinsic forms of MOVDQU load and store
def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
+ IIC_SSE_MOVU_P_MR>,
XS, VEX, Requires<[HasAVX]>;
def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
+ IIC_SSE_MOVU_P_MR>,
XS, Requires<[HasSSE2]>;
} // ExeDomain = SSEPackedInt
let Predicates = [HasAVX] in {
- def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
(VMOVDQUYmr addr:$dst, VR256:$src)>;
}
@@ -3404,11 +3581,17 @@ let Predicates = [HasAVX] in {
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//
+def SSE_PMADD : OpndItins<
+ IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
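// A sketch of the itinerary carrier, assuming the class definitions earlier
// in this file: OpndItins bundles one InstrItinClass for the register form
// (rr) and one for the memory form (rm), roughly
//   class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
//     InstrItinClass rr = arg_rr;
//     InstrItinClass rm = arg_rm;
//   }
// so SSE_PMADD simply reuses IIC_SSE_PMADD for both forms.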
+
let ExeDomain = SSEPackedInt in { // SSE integer instructions
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit IsCommutable = 0,
+ X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0,
bit Is2Addr = 1> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
@@ -3416,58 +3599,64 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>;
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))]>;
+ [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
+ itins.rm>;
}
-multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
- string OpcodeStr, Intrinsic IntId,
- Intrinsic IntId2, RegisterClass RC,
- bit Is2Addr = 1> {
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+ ShiftOpndItins itins,
+ bit Is2Addr = 1> {
// src2 is always 128-bit
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, VR128:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, VR128:$src2))]>;
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ itins.rr>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, (bitconvert (memopv2i64 addr:$src2))))]>;
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>;
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
(ins RC:$src1, i32i8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId2 RC:$src1, (i32 imm:$src2)))]>;
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>;
}
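// The three forms above map to the three ways a shift count can be supplied:
// rr takes the count in an XMM register, rm loads it from memory, and ri
// encodes it as an immediate. ShiftOpndItins is assumed to be a three-field
// variant of OpndItins that adds the itins.ri itinerary for the immediate
// form.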
-/// PDI_binop_rm - Simple SSE2 binary operator.
-multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit IsCommutable = 0,
- bit Is2Addr = 1> {
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0, bit Is2Addr = 1> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))], itins.rm>;
}
} // ExeDomain = SSEPackedInt
@@ -3476,185 +3665,242 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in {
defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, VR128, memopv2i64,
- i128mem, 1, 0 /*3addr*/>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 1, 0 /*3addr*/>,
+ VEX_4V;
defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDQ : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
+ i128mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
+ i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64,
- i128mem, 0, 0>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64,
- i128mem, 0, 0>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
- i128mem, 0, 0>, VEX_4V;
+ i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
- i128mem, 0, 0>, VEX_4V;
+ i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V;
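// pmuludq multiplies the even-numbered unsigned 32-bit elements into full
// 64-bit products, so its source and destination types differ (v2i64 from
// v4i32 above). That is why it is now expressed with the X86pmuludq node via
// PDI_binop_rm2 rather than the intrinsic-based multiclass.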
// Intrinsic forms
defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
-defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_PMADD, 1, 0>, VEX_4V;
defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
+ i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
- i256mem, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
- i256mem, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
- i256mem, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
- i256mem, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+ VR256, memopv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
// Intrinsic forms
defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
-defm VPMULUDQY : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_avx2_pmulu_dq,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_PMADD, 1, 0>, VEX_4V;
defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
}
let Constraints = "$src1 = $dst" in {
defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, VR128, memopv2i64,
- i128mem, 1>;
+ i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64,
- i128mem, 1>;
+ i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64,
- i128mem, 1>;
+ i128mem, SSE_INTALU_ITINS_P, 1>;
defm PADDQ : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64,
- i128mem, 1>;
+ i128mem, SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64,
- i128mem, 1>;
+ i128mem, SSE_INTMUL_ITINS_P, 1>;
defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64,
- i128mem>;
+ i128mem, SSE_INTALU_ITINS_P>;
defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64,
- i128mem>;
+ i128mem, SSE_INTALU_ITINS_P>;
defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
- i128mem>;
+ i128mem, SSE_INTALU_ITINS_P>;
defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
- i128mem>;
+ i128mem, SSE_INTALUQ_ITINS_P>;
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
// Intrinsic forms
defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1>;
defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
- VR128, memopv2i64, i128mem, 1>;
-defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTMUL_ITINS_P, 1>;
defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_PMADD, 1>;
defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
- VR128, memopv2i64, i128mem, 1>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
} // Constraints = "$src1 = $dst"
@@ -3663,159 +3909,138 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
//===---------------------------------------------------------------------===//
let Predicates = [HasAVX] in {
-defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw",
- int_x86_sse2_psll_w, int_x86_sse2_pslli_w,
- VR128, 0>, VEX_4V;
-defm VPSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld",
- int_x86_sse2_psll_d, int_x86_sse2_pslli_d,
- VR128, 0>, VEX_4V;
-defm VPSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq",
- int_x86_sse2_psll_q, int_x86_sse2_pslli_q,
- VR128, 0>, VEX_4V;
-
-defm VPSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw",
- int_x86_sse2_psrl_w, int_x86_sse2_psrli_w,
- VR128, 0>, VEX_4V;
-defm VPSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld",
- int_x86_sse2_psrl_d, int_x86_sse2_psrli_d,
- VR128, 0>, VEX_4V;
-defm VPSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq",
- int_x86_sse2_psrl_q, int_x86_sse2_psrli_q,
- VR128, 0>, VEX_4V;
-
-defm VPSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw",
- int_x86_sse2_psra_w, int_x86_sse2_psrai_w,
- VR128, 0>, VEX_4V;
-defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad",
- int_x86_sse2_psra_d, int_x86_sse2_psrai_d,
- VR128, 0>, VEX_4V;
-
-defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
-defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
-defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64,
- i128mem, 1, 0>, VEX_4V;
-defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64,
- i128mem, 0, 0>, VEX_4V;
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
let ExeDomain = SSEPackedInt in {
- let neverHasSideEffects = 1 in {
- // 128-bit logical shifts.
- def VPSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V;
- def VPSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V;
- // PSRADQri doesn't exist in SSE[1-3].
- }
-}
+ // 128-bit logical shifts.
+ def VPSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
+ VEX_4V;
+ def VPSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
+ VEX_4V;
+ // PSRADQri doesn't exist in SSE[1-3].
}
+} // Predicates = [HasAVX]
let Predicates = [HasAVX2] in {
-defm VPSLLWY : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw",
- int_x86_avx2_psll_w, int_x86_avx2_pslli_w,
- VR256, 0>, VEX_4V;
-defm VPSLLDY : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld",
- int_x86_avx2_psll_d, int_x86_avx2_pslli_d,
- VR256, 0>, VEX_4V;
-defm VPSLLQY : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq",
- int_x86_avx2_psll_q, int_x86_avx2_pslli_q,
- VR256, 0>, VEX_4V;
-
-defm VPSRLWY : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw",
- int_x86_avx2_psrl_w, int_x86_avx2_psrli_w,
- VR256, 0>, VEX_4V;
-defm VPSRLDY : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld",
- int_x86_avx2_psrl_d, int_x86_avx2_psrli_d,
- VR256, 0>, VEX_4V;
-defm VPSRLQY : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq",
- int_x86_avx2_psrl_q, int_x86_avx2_psrli_q,
- VR256, 0>, VEX_4V;
-
-defm VPSRAWY : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw",
- int_x86_avx2_psra_w, int_x86_avx2_psrai_w,
- VR256, 0>, VEX_4V;
-defm VPSRADY : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad",
- int_x86_avx2_psra_d, int_x86_avx2_psrai_d,
- VR256, 0>, VEX_4V;
-
-defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
-defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
-defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
- i256mem, 1, 0>, VEX_4V;
-defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
- i256mem, 0, 0>, VEX_4V;
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+ VR256, v4i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+ VR256, v4i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR256, v16i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+ VR256, v8i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
let ExeDomain = SSEPackedInt in {
- let neverHasSideEffects = 1 in {
- // 128-bit logical shifts.
- def VPSLLDQYri : PDIi8<0x73, MRM7r,
- (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V;
- def VPSRLDQYri : PDIi8<0x73, MRM3r,
- (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V;
- // PSRADQYri doesn't exist in SSE[1-3].
- }
-}
+ // 256-bit logical shifts.
+ def VPSLLDQYri : PDIi8<0x73, MRM7r,
+ (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+ "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
+ VEX_4V;
+ def VPSRLDQYri : PDIi8<0x73, MRM3r,
+ (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+ "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR256:$dst,
+ (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
+ VEX_4V;
+ // PSRADQYri doesn't exist in SSE[1-3].
}
+} // Predicates = [HasAVX2]
let Constraints = "$src1 = $dst" in {
-defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
- int_x86_sse2_psll_w, int_x86_sse2_pslli_w,
- VR128>;
-defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
- int_x86_sse2_psll_d, int_x86_sse2_pslli_d,
- VR128>;
-defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
- int_x86_sse2_psll_q, int_x86_sse2_pslli_q,
- VR128>;
-
-defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
- int_x86_sse2_psrl_w, int_x86_sse2_psrli_w,
- VR128>;
-defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
- int_x86_sse2_psrl_d, int_x86_sse2_psrli_d,
- VR128>;
-defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
- int_x86_sse2_psrl_q, int_x86_sse2_psrli_q,
- VR128>;
-
-defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
- int_x86_sse2_psra_w, int_x86_sse2_psrai_w,
- VR128>;
-defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
- int_x86_sse2_psra_d, int_x86_sse2_psrai_d,
- VR128>;
-
-defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64,
- i128mem, 1>;
-defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64,
- i128mem, 1>;
-defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64,
- i128mem, 1>;
-defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
- i128mem, 0>;
+defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ VR128, v2i64, v2i64, bc_v2i64,
+ SSE_INTSHIFT_ITINS_P>;
+
+defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16,
+ SSE_INTSHIFT_ITINS_P>;
+defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ VR128, v4i32, v4i32, bc_v4i32,
+ SSE_INTSHIFT_ITINS_P>;
let ExeDomain = SSEPackedInt in {
- let neverHasSideEffects = 1 in {
- // 128-bit logical shifts.
- def PSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
- "pslldq\t{$src2, $dst|$dst, $src2}", []>;
- def PSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
- "psrldq\t{$src2, $dst|$dst, $src2}", []>;
- // PSRADQri doesn't exist in SSE[1-3].
- }
+ // 128-bit logical shifts.
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+ // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
@@ -3824,17 +4049,13 @@ let Predicates = [HasAVX] in {
(VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
(VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
- (VPSLLDQri VR128:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
- (VPSRLDQri VR128:$src1, imm:$src2)>;
def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
(VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  // Shift up / down and insert zeros.
- def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+ def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
(VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
- def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+ def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
(VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}
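// The psll_dq/psrl_dq intrinsics express the shift amount in bits while the
// instructions shift whole bytes; BYTE_imm is assumed to be an SDNodeXForm,
// defined earlier in this file, that rewrites the immediate as imm >> 3.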
@@ -3843,10 +4064,6 @@ let Predicates = [HasAVX2] in {
(VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2),
- (VPSLLDQYri VR256:$src1, imm:$src2)>;
- def : Pat<(int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2),
- (VPSRLDQYri VR256:$src1, imm:$src2)>;
}
let Predicates = [HasSSE2] in {
@@ -3854,17 +4071,13 @@ let Predicates = [HasSSE2] in {
(PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
(PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
- def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
- (PSLLDQri VR128:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
- (PSRLDQri VR128:$src1, imm:$src2)>;
def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
(PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  // Shift up / down and insert zeros.
- def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+ def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
(PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
- def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+ def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
(PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}
@@ -3873,177 +4086,106 @@ let Predicates = [HasSSE2] in {
//===---------------------------------------------------------------------===//
let Predicates = [HasAVX] in {
- defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
- defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
- defm VPCMPEQD : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_sse2_pcmpeq_d,
- VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
- defm VPCMPGTB : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_sse2_pcmpgt_b,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
- defm VPCMPGTW : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_sse2_pcmpgt_w,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
- defm VPCMPGTD : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
-
- def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
- (VPCMPEQBrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v16i8 (X86pcmpeqb VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (VPCMPEQBrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
- (VPCMPEQWrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86pcmpeqw VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (VPCMPEQWrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
- (VPCMPEQDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86pcmpeqd VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (VPCMPEQDrm VR128:$src1, addr:$src2)>;
-
- def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
- (VPCMPGTBrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v16i8 (X86pcmpgtb VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (VPCMPGTBrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
- (VPCMPGTWrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86pcmpgtw VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (VPCMPGTWrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
- (VPCMPGTDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86pcmpgtd VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (VPCMPGTDrm VR128:$src1, addr:$src2)>;
+ defm VPCMPEQB : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v16i8,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPEQW : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v8i16,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPEQD : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPGTB : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v16i8,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ defm VPCMPGTW : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v8i16,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ defm VPCMPGTD : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
- defm VPCMPEQBY : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_avx2_pcmpeq_b,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
- defm VPCMPEQWY : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_avx2_pcmpeq_w,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
- defm VPCMPEQDY : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_avx2_pcmpeq_d,
- VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
- defm VPCMPGTBY : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_avx2_pcmpgt_b,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
- defm VPCMPGTWY : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_avx2_pcmpgt_w,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
- defm VPCMPGTDY : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_avx2_pcmpgt_d,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
-
- def : Pat<(v32i8 (X86pcmpeqb VR256:$src1, VR256:$src2)),
- (VPCMPEQBYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v32i8 (X86pcmpeqb VR256:$src1,
- (bc_v32i8 (memopv4i64 addr:$src2)))),
- (VPCMPEQBYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v16i16 (X86pcmpeqw VR256:$src1, VR256:$src2)),
- (VPCMPEQWYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v16i16 (X86pcmpeqw VR256:$src1,
- (bc_v16i16 (memopv4i64 addr:$src2)))),
- (VPCMPEQWYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86pcmpeqd VR256:$src1, VR256:$src2)),
- (VPCMPEQDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86pcmpeqd VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VPCMPEQDYrm VR256:$src1, addr:$src2)>;
-
- def : Pat<(v32i8 (X86pcmpgtb VR256:$src1, VR256:$src2)),
- (VPCMPGTBYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v32i8 (X86pcmpgtb VR256:$src1,
- (bc_v32i8 (memopv4i64 addr:$src2)))),
- (VPCMPGTBYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v16i16 (X86pcmpgtw VR256:$src1, VR256:$src2)),
- (VPCMPGTWYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v16i16 (X86pcmpgtw VR256:$src1,
- (bc_v16i16 (memopv4i64 addr:$src2)))),
- (VPCMPGTWYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86pcmpgtd VR256:$src1, VR256:$src2)),
- (VPCMPGTDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86pcmpgtd VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VPCMPGTDYrm VR256:$src1, addr:$src2)>;
+ defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
let Constraints = "$src1 = $dst" in {
- defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b,
- VR128, memopv2i64, i128mem, 1>;
- defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w,
- VR128, memopv2i64, i128mem, 1>;
- defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d,
- VR128, memopv2i64, i128mem, 1>;
- defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b,
- VR128, memopv2i64, i128mem>;
- defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w,
- VR128, memopv2i64, i128mem>;
- defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d,
- VR128, memopv2i64, i128mem>;
+ defm PCMPEQB : PDI_binop_rm<0x74, "pcmpeqb", X86pcmpeq, v16i8,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
+ defm PCMPEQW : PDI_binop_rm<0x75, "pcmpeqw", X86pcmpeq, v8i16,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
+ defm PCMPEQD : PDI_binop_rm<0x76, "pcmpeqd", X86pcmpeq, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 1>;
+ defm PCMPGTB : PDI_binop_rm<0x64, "pcmpgtb", X86pcmpgt, v16i8,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
+ defm PCMPGTW : PDI_binop_rm<0x65, "pcmpgtw", X86pcmpgt, v8i16,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
+ defm PCMPGTD : PDI_binop_rm<0x66, "pcmpgtd", X86pcmpgt, v4i32,
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
} // Constraints = "$src1 = $dst"
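// With the compares now selected directly through the X86pcmpeq and X86pcmpgt
// nodes in PDI_binop_rm, the standalone def : Pat blocks that mapped those
// nodes onto the intrinsic-based instructions are redundant and are deleted
// below.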
-let Predicates = [HasSSE2] in {
- def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
- (PCMPEQBrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v16i8 (X86pcmpeqb VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (PCMPEQBrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
- (PCMPEQWrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86pcmpeqw VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (PCMPEQWrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
- (PCMPEQDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86pcmpeqd VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (PCMPEQDrm VR128:$src1, addr:$src2)>;
-
- def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
- (PCMPGTBrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v16i8 (X86pcmpgtb VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (PCMPGTBrm VR128:$src1, addr:$src2)>;
- def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
- (PCMPGTWrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86pcmpgtw VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (PCMPGTWrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
- (PCMPGTDrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86pcmpgtd VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (PCMPGTDrm VR128:$src1, addr:$src2)>;
-}
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Pack Instructions
//===---------------------------------------------------------------------===//
let Predicates = [HasAVX] in {
defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
- VR128, memopv2i64, i128mem, 0, 0>, VEX_4V;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
- VR256, memopv4i64, i256mem, 0, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem,
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
}
let Constraints = "$src1 = $dst" in {
defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ SSE_INTALU_ITINS_P>;
} // Constraints = "$src1 = $dst"
//===---------------------------------------------------------------------===//
@@ -4051,134 +4193,75 @@ defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
//===---------------------------------------------------------------------===//
let ExeDomain = SSEPackedInt in {
-multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, PatFrag pshuf_frag,
- PatFrag bc_frag> {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, SDNode OpNode> {
def ri : Ii8<0x70, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (vt (pshuf_frag:$src2 VR128:$src1,
- (undef))))]>;
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>;
def mi : Ii8<0x70, MRMSrcMem,
- (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (vt (pshuf_frag:$src2
- (bc_frag (memopv2i64 addr:$src1)),
- (undef))))]>;
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>;
}
-multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, PatFrag pshuf_frag,
- PatFrag bc_frag> {
+multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> {
def Yri : Ii8<0x70, MRMSrcReg,
(outs VR256:$dst), (ins VR256:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (vt (pshuf_frag:$src2 VR256:$src1,
- (undef))))]>;
+ [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>;
def Ymi : Ii8<0x70, MRMSrcMem,
(outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (vt (pshuf_frag:$src2
- (bc_frag (memopv4i64 addr:$src1)),
- (undef))))]>;
+ [(set VR256:$dst,
+ (vt (OpNode (bitconvert (memopv4i64 addr:$src1)),
+ (i8 imm:$src2))))]>;
}
} // ExeDomain = SSEPackedInt
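// For pshufd, each 2-bit field of the i8 immediate selects the source dword
// for the corresponding destination element; pshufhw/pshuflw apply the same
// scheme to the high or low four words. E.g. pshufd $0x1b (0b00011011)
// reverses the four dwords.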
let Predicates = [HasAVX] in {
- let AddedComplexity = 5 in
- defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize,
- VEX;
-
- // SSE2 with ImmT == Imm8 and XS prefix.
- defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, pshufhw, bc_v8i16>, XS,
- VEX;
-
- // SSE2 with ImmT == Imm8 and XD prefix.
- defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD,
- VEX;
-
- let AddedComplexity = 5 in
- def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
- (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
- // Unary v4f32 shuffle with VPSHUF* in order to fold a load.
- def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
- (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
-
- def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
- (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
- (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>;
- def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>;
- def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
- (VPSHUFHWri VR128:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
- (i8 imm:$imm))),
- (VPSHUFHWmi addr:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
- (VPSHUFLWri VR128:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
- (i8 imm:$imm))),
- (VPSHUFLWmi addr:$src, imm:$imm)>;
-}
+ let AddedComplexity = 5 in
+ defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, X86PShufd>, TB, OpSize, VEX;
-let Predicates = [HasAVX2] in {
- let AddedComplexity = 5 in
- defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, pshufd, bc_v8i32>, TB,
- OpSize, VEX;
+ // SSE2 with ImmT == Imm8 and XS prefix.
+ defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, X86PShufhw>, XS, VEX;
- // SSE2 with ImmT == Imm8 and XS prefix.
- defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, pshufhw, bc_v16i16>, XS,
- VEX;
+ // SSE2 with ImmT == Imm8 and XD prefix.
+ defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, X86PShuflw>, XD, VEX;
+
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
- // SSE2 with ImmT == Imm8 and XD prefix.
- defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, pshuflw, bc_v16i16>, XD,
- VEX;
+let Predicates = [HasAVX2] in {
+ defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX;
+ defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX;
+ defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
}
let Predicates = [HasSSE2] in {
- let AddedComplexity = 5 in
- defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize;
-
- // SSE2 with ImmT == Imm8 and XS prefix.
- defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, pshufhw, bc_v8i16>, XS;
-
- // SSE2 with ImmT == Imm8 and XD prefix.
- defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD;
-
- let AddedComplexity = 5 in
- def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
- (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
- // Unary v4f32 shuffle with PSHUF* in order to fold a load.
- def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
- (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
-
- def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
- (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
- (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
- def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
- def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
- (PSHUFHWri VR128:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
- (i8 imm:$imm))),
- (PSHUFHWmi addr:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
- (PSHUFLWri VR128:$src, imm:$imm)>;
- def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
- (i8 imm:$imm))),
- (PSHUFLWmi addr:$src, imm:$imm)>;
+ let AddedComplexity = 5 in
+ defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
+
+ // SSE2 with ImmT == Imm8 and XS prefix.
+ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, X86PShufhw>, XS;
+
+ // SSE2 with ImmT == Imm8 and XD prefix.
+ defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, X86PShuflw>, XD;
+
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
}
//===---------------------------------------------------------------------===//
@@ -4193,7 +4276,8 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+ IIC_SSE_UNPCK>;
def rm : PDI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
@@ -4201,7 +4285,8 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst, (OpNode VR128:$src1,
(bc_frag (memopv2i64
- addr:$src2))))]>;
+ addr:$src2))))],
+ IIC_SSE_UNPCK>;
}
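The !if(Is2Addr, ...) selection in sse2_unpack recurs in nearly every multiclass this patch touches: the legacy SSE form ties $dst to $src1 and prints two operands, while the VEX form has a separate destination and prints three. A stand-alone sketch of just the string selection (hypothetical class, processable by llvm-tblgen):

class AsmSel_sk<string OpcodeStr, bit Is2Addr> {
  string AsmString =
    !if(Is2Addr,
        // two-address SSE form: $dst doubles as the first source
        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        // three-address AVX form: $src1, $src2 and $dst are distinct
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"));
}
def UNPCK_sse_sk : AsmSel_sk<"punpcklbw", 1>;
def UNPCK_avx_sk : AsmSel_sk<"vpunpcklbw", 0>;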
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
@@ -4300,14 +4385,6 @@ let Predicates = [HasAVX] in {
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}
-// Splat v2f64 / v2i64
-let AddedComplexity = 10 in {
- def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
- (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
- def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
- (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>;
-}
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//
@@ -4321,7 +4398,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+ (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>;
def rmi : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, i32i8imm:$src3),
@@ -4330,7 +4407,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
- imm:$src3))]>;
+ imm:$src3))], IIC_SSE_PINSRW>;
}
// Extract
@@ -4344,7 +4421,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>;
+ imm:$src2))], IIC_SSE_PEXTRW>;
// Insert
let Predicates = [HasAVX] in {
@@ -4368,9 +4445,10 @@ let ExeDomain = SSEPackedInt in {
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX;
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>, VEX;
def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+ "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
@@ -4382,7 +4460,8 @@ def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ IIC_SSE_MOVMSK>;
} // ExeDomain = SSEPackedInt
@@ -4396,21 +4475,25 @@ let Uses = [EDI] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, VEX;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>, VEX;
let Uses = [EDI] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+ IIC_SSE_MASKMOV>;
let Uses = [RDI] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+ IIC_SSE_MASKMOV>;
} // ExeDomain = SSEPackedInt
@@ -4424,54 +4507,65 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))]>, VEX;
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ VEX;
def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>,
VEX;
def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, VEX;
def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, VEX;
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector GR32:$src)))]>;
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>;
def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>;
def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector GR64:$src)))]>;
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>;
def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))]>;
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX;
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, VEX;
def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>,
VEX;
def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))]>;
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>;
def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
@@ -4479,20 +4573,22 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
- (iPTR 0)))]>, VEX;
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX;
def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)]>, VEX;
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ VEX;
def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
- (iPTR 0)))]>;
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>;
def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
- (iPTR 0))), addr:$dst)]>;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
@@ -4500,13 +4596,15 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
- (iPTR 0)))]>,
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>,
TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
- (iPTR 0)))]>;
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>;
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
@@ -4518,36 +4616,45 @@ def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
VEX;
def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX;
def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+ IIC_SSE_MOVDQ>;
def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVD_ToGP>;
def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX;
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX;
def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))]>;
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>;
def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Patterns and instructions to describe movd/movq to XMM register zero-extends
@@ -4556,23 +4663,26 @@ let AddedComplexity = 15 in {
def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector GR32:$src)))))]>,
- VEX;
+ (v4i32 (scalar_to_vector GR32:$src)))))],
+ IIC_SSE_MOVDQ>, VEX;
def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector GR64:$src)))))]>,
+ (v2i64 (scalar_to_vector GR64:$src)))))],
+ IIC_SSE_MOVDQ>,
VEX, VEX_W;
}
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector GR32:$src)))))]>;
+ (v4i32 (scalar_to_vector GR32:$src)))))],
+ IIC_SSE_MOVDQ>;
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector GR64:$src)))))]>;
+ (v2i64 (scalar_to_vector GR64:$src)))))],
+ IIC_SSE_MOVDQ>;
}
let AddedComplexity = 20 in {
@@ -4580,29 +4690,19 @@ def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
- (loadi32 addr:$src))))))]>,
- VEX;
+ (loadi32 addr:$src))))))],
+ IIC_SSE_MOVDQ>, VEX;
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
- (loadi32 addr:$src))))))]>;
-}
-
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (MOVZDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (MOVZDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (MOVZDI2PDIrm addr:$src)>;
+ (loadi32 addr:$src))))))],
+ IIC_SSE_MOVDQ>;
}
let Predicates = [HasAVX] in {
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVZDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(VMOVZDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -4617,6 +4717,13 @@ let Predicates = [HasAVX] in {
(SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
+let Predicates = [HasSSE2], AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+}
+
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
@@ -4648,7 +4755,8 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+ IIC_SSE_MOVDQ>, XS,
Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
//===---------------------------------------------------------------------===//
@@ -4657,11 +4765,13 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)]>, VEX;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))), addr:$dst)]>;
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOVDQ>;
//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
@@ -4671,14 +4781,16 @@ def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
+ IIC_SSE_MOVDQ>;
let AddedComplexity = 20 in
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))]>,
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 20 in
@@ -4686,9 +4798,19 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))]>,
+ (loadi64 addr:$src))))))],
+ IIC_SSE_MOVDQ>,
XS, Requires<[HasSSE2]>;
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZQI2PQIrm addr:$src)>;
+}
+
let Predicates = [HasSSE2], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZQI2PQIrm addr:$src)>;
@@ -4697,13 +4819,9 @@ let Predicates = [HasSSE2], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}
-let Predicates = [HasAVX], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVZQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (VMOVZQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVZQI2PQIrm addr:$src)>;
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
}
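The replacement pattern leans on a VEX guarantee that is worth spelling out (a reading of the idiom, not text from the patch):

// VEX-encoded 128-bit loads such as VMOVAPSrm zero bits 255:128 of the
// destination ymm register, so a zero-extending 256-bit load needs no
// extra instruction. SUBREG_TO_REG (i32 0) asserts that the bits outside
// the sub_xmm slice are the constant 0, re-labelling the xmm load result
// as a ymm without emitting any zeroing code.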
//===---------------------------------------------------------------------===//
@@ -4713,51 +4831,58 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+ IIC_SSE_MOVQ_RR>,
XS, Requires<[HasSSE2]>;
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))]>,
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))]>,
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>,
XS, Requires<[HasSSE2]>;
}
let AddedComplexity = 20 in {
- let Predicates = [HasSSE2] in {
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
- (MOVZPQILo2PQIrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (MOVZPQILo2PQIrr VR128:$src)>;
- }
let Predicates = [HasAVX] in {
- def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVZPQILo2PQIrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(VMOVZPQILo2PQIrr VR128:$src)>;
}
+ let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+ }
}
// Instructions to match in the assembler
def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+ "movd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_MOVDQ>, VEX, VEX_W;
// Instructions for the disassembler
// xr = XMM register
@@ -4767,7 +4892,7 @@ let Predicates = [HasAVX] in
def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", []>, XS;
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
//===---------------------------------------------------------------------===//
// SSE3 - Conversion Instructions
@@ -4797,14 +4922,16 @@ def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
}
def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>;
def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>;
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
- (VCVTPD2DQYrr VR256:$src)>;
+ (VCVTTPD2DQYrr VR256:$src)>;
def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
- (VCVTPD2DQYrm addr:$src)>;
+ (VCVTTPD2DQYrm addr:$src)>;
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
@@ -4819,9 +4946,11 @@ def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
}
def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>;
def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>;
// AVX 256-bit register conversion intrinsics
def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
@@ -4847,10 +4976,12 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (vt (OpNode RC:$src)))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src)))],
+ IIC_SSE_MOV_LH>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>;
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+ IIC_SSE_MOV_LH>;
}
let Predicates = [HasAVX] in {
@@ -4868,17 +4999,6 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
memopv4f32, f128mem>;
-let Predicates = [HasSSE3] in {
- def : Pat<(v4i32 (X86Movshdup VR128:$src)),
- (MOVSHDUPrr VR128:$src)>;
- def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
- (MOVSHDUPrm addr:$src)>;
- def : Pat<(v4i32 (X86Movsldup VR128:$src)),
- (MOVSLDUPrr VR128:$src)>;
- def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
- (MOVSLDUPrm addr:$src)>;
-}
-
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
(VMOVSHDUPrr VR128:$src)>;
@@ -4898,82 +5018,60 @@ let Predicates = [HasAVX] in {
(VMOVSLDUPYrm addr:$src)>;
}
+let Predicates = [HasSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSLDUPrm addr:$src)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//
multiclass sse3_replicate_dfp<string OpcodeStr> {
+let neverHasSideEffects = 1 in
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
+ [], IIC_SSE_MOV_LH>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
- (undef))))]>;
+ (v2f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))],
+ IIC_SSE_MOV_LH>;
}
// FIXME: Merge with the class above once there are patterns for the ymm version.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>;
+def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (v4f64 (X86Movddup
+ (scalar_to_vector (loadf64 addr:$src)))))]>;
+}
+
let Predicates = [HasAVX] in {
- def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>;
- def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>;
- }
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
-defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
-defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
-
-let Predicates = [HasSSE3] in {
- def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
- (undef)),
- (MOVDDUPrm addr:$src)>;
- let AddedComplexity = 5 in {
- def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
- def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
- def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
- (MOVDDUPrm addr:$src)>;
- }
- def : Pat<(X86Movddup (memopv2f64 addr:$src)),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVDDUPrm addr:$src)>;
-}
let Predicates = [HasAVX] in {
- def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
- (undef)),
- (VMOVDDUPrm addr:$src)>;
- let AddedComplexity = 5 in {
- def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
- def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
- (VMOVDDUPrm addr:$src)>;
- def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
- def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
- (VMOVDDUPrm addr:$src)>;
- }
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (bc_v2f64
(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
@@ -4983,16 +5081,24 @@ let Predicates = [HasAVX] in {
(VMOVDDUPYrm addr:$src)>;
def : Pat<(X86Movddup (memopv4i64 addr:$src)),
(VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))),
- (VMOVDDUPYrm addr:$src)>;
def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
(VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (v4f64 VR256:$src)),
- (VMOVDDUPYrr VR256:$src)>;
def : Pat<(X86Movddup (v4i64 VR256:$src)),
(VMOVDDUPYrr VR256:$src)>;
}
+let Predicates = [HasSSE3] in {
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVDDUPrm addr:$src)>;
+}
+
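The bc_* operands in these patterns (bc_v2f64 here, bc_v4i32 and bc_v8i16 in earlier hunks) are bitconvert pattern fragments: they let a load that type-checks as one vector type feed a node expecting another, so a single MOVDDUPrm fold covers f32-, i64- and f64-typed memory. Their presumed shape, following the fragment style of X86InstrFragmentsSIMD.td (not stand-alone; PatFrag and bitconvert come from TargetSelectionDAG.td):

def bc_v2f64_sk : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;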
//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
@@ -5007,49 +5113,51 @@ let Predicates = [HasAVX] in {
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+ IIC_SSE_LDDQU>;
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, bit Is2Addr = 1> {
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
def rr : I<0xD0, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))]>;
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>;
def rm : I<0xD0, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>;
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rm>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
- f128mem, 0>, TB, XD, VEX_4V;
+ f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, 0>, TB, XD, VEX_4V;
+ f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
- f128mem, 0>, TB, OpSize, VEX_4V;
+ f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, 0>, TB, OpSize, VEX_4V;
+ f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
}
}
let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in {
let ExeDomain = SSEPackedSingle in
defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
- f128mem>, TB, XD;
+ f128mem, SSE_ALU_F32P>, TB, XD;
let ExeDomain = SSEPackedDouble in
defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
- f128mem>, TB, OpSize;
+ f128mem, SSE_ALU_F64P>, TB, OpSize;
}
//===---------------------------------------------------------------------===//
@@ -5063,13 +5171,14 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
@@ -5077,13 +5186,14 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+ IIC_SSE_HADDSUB_RM>;
}
let Predicates = [HasAVX] in {
@@ -5131,7 +5241,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
OpSize;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
@@ -5139,7 +5249,8 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId128
- (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
+ (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
+ OpSize;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
@@ -5188,9 +5299,52 @@ defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
+def SSE_PHADDSUBD : OpndItins<
+ IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+ IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+ IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+def SSE_PSHUFB : OpndItins<
+ IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+def SSE_PSIGN : OpndItins<
+ IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+def SSE_PMULHRSW : OpndItins<
+ IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
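These OpndItins records bundle a register-form and a memory-form itinerary into one value, so a single multiclass parameter can fan out to itins.rr and itins.rm. A stand-alone sketch of the presumed shape of the class (the real one is introduced elsewhere in this patch; names here are hypothetical):

class InstrItinClass_sk {}
def IIC_RR_sk : InstrItinClass_sk;  // scheduling class for the reg-reg form
def IIC_RM_sk : InstrItinClass_sk;  // scheduling class for the folded-load form
class OpndItins_sk<InstrItinClass_sk arg_rr, InstrItinClass_sk arg_rm> {
  InstrItinClass_sk rr = arg_rr;
  InstrItinClass_sk rm = arg_rm;
}
def SSE_EXAMPLE_sk : OpndItins_sk<IIC_RR_sk, IIC_RM_sk>;
// A multiclass taking "OpndItins_sk itins" then writes itins.rr / itins.rm.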
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
+ let isCommutable = 1 in
+ def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ OpSize;
+ def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize;
+}
+
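SS3I_binop_rm is instantiated through defm, and the defm name is prepended to each inner def; that concatenation is where names like PHADDWrr/PHADDWrm (and VPHADDWYrr for the VR256 instantiations below) come from. A stand-alone sketch with hypothetical records:

class Inst_sk<string asm> { string AsmString = asm; }
multiclass BinOp_sk<string OpcodeStr> {
  def rr : Inst_sk<!strconcat(OpcodeStr, "\t$dst, $src2")>;
  def rm : Inst_sk<!strconcat(OpcodeStr, "\t$dst, [mem]")>;
}
defm PHADDW_sk : BinOp_sk<"phaddw">;
// llvm-tblgen now sees two records, PHADDW_skrr and PHADDW_skrm.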
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, bit Is2Addr = 1> {
+ Intrinsic IntId128, OpndItins itins,
+ bit Is2Addr = 1> {
let isCommutable = 1 in
def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -5227,57 +5381,77 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
- defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw",
- int_x86_ssse3_phadd_w_128, 0>, VEX_4V;
- defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd",
- int_x86_ssse3_phadd_d_128, 0>, VEX_4V;
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
+ memopv2i64, i128mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
+ memopv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
- int_x86_ssse3_phadd_sw_128, 0>, VEX_4V;
- defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw",
- int_x86_ssse3_phsub_w_128, 0>, VEX_4V;
- defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd",
- int_x86_ssse3_phsub_d_128, 0>, VEX_4V;
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW, 0>, VEX_4V;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
- int_x86_ssse3_phsub_sw_128, 0>, VEX_4V;
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW, 0>, VEX_4V;
defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V;
- defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb",
- int_x86_ssse3_pshuf_b_128, 0>, VEX_4V;
- defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
- int_x86_ssse3_psign_b_128, 0>, VEX_4V;
- defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
- int_x86_ssse3_psign_w_128, 0>, VEX_4V;
- defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
- int_x86_ssse3_psign_d_128, 0>, VEX_4V;
+ int_x86_ssse3_pmadd_ub_sw_128,
+ SSE_PMADD, 0>, VEX_4V;
}
defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V;
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW, 0>, VEX_4V;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
- defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw",
- int_x86_avx2_phadd_w>, VEX_4V;
- defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd",
- int_x86_avx2_phadd_d>, VEX_4V;
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBW, 0>, VEX_4V;
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PHADDSUBD, 0>, VEX_4V;
+ defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
+ memopv4i64, i256mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
+ memopv4i64, i256mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
+ memopv4i64, i256mem,
+ SSE_PSIGN, 0>, VEX_4V;
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
+ memopv4i64, i256mem,
+ SSE_PSHUFB, 0>, VEX_4V;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw>, VEX_4V;
- defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw",
- int_x86_avx2_phsub_w>, VEX_4V;
- defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd",
- int_x86_avx2_phsub_d>, VEX_4V;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw>, VEX_4V;
defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
int_x86_avx2_pmadd_ub_sw>, VEX_4V;
- defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb",
- int_x86_avx2_pshuf_b>, VEX_4V;
- defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb",
- int_x86_avx2_psign_b>, VEX_4V;
- defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw",
- int_x86_avx2_psign_w>, VEX_4V;
- defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd",
- int_x86_avx2_psign_d>, VEX_4V;
}
defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
int_x86_avx2_pmul_hr_sw>, VEX_4V;
@@ -5286,95 +5460,34 @@ defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
- defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw",
- int_x86_ssse3_phadd_w_128>;
- defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd",
- int_x86_ssse3_phadd_d_128>;
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBW>;
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PHADDSUBD>;
+ defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
+ memopv2i64, i128mem, SSE_PSIGN>;
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+ memopv2i64, i128mem, SSE_PSHUFB>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
- int_x86_ssse3_phadd_sw_128>;
- defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw",
- int_x86_ssse3_phsub_w_128>;
- defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd",
- int_x86_ssse3_phsub_d_128>;
+ int_x86_ssse3_phadd_sw_128,
+ SSE_PHADDSUBSW>;
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
- int_x86_ssse3_phsub_sw_128>;
+ int_x86_ssse3_phsub_sw_128,
+ SSE_PHADDSUBSW>;
defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128>;
- defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb",
- int_x86_ssse3_pshuf_b_128>;
- defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb",
- int_x86_ssse3_psign_b_128>;
- defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw",
- int_x86_ssse3_psign_w_128>;
- defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd",
- int_x86_ssse3_psign_d_128>;
+ int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128>;
-}
-
-let Predicates = [HasSSSE3] in {
- def : Pat<(X86pshufb VR128:$src, VR128:$mask),
- (PSHUFBrr128 VR128:$src, VR128:$mask)>;
- def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
- (PSHUFBrm128 VR128:$src, addr:$mask)>;
-
- def : Pat<(v16i8 (X86psign VR128:$src1, VR128:$src2)),
- (PSIGNBrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86psign VR128:$src1, VR128:$src2)),
- (PSIGNWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86psign VR128:$src1, VR128:$src2)),
- (PSIGNDrr128 VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v8i16 (X86hadd VR128:$src1, VR128:$src2)),
- (PHADDWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86hadd VR128:$src1, VR128:$src2)),
- (PHADDDrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86hsub VR128:$src1, VR128:$src2)),
- (PHSUBWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86hsub VR128:$src1, VR128:$src2)),
- (PHSUBDrr128 VR128:$src1, VR128:$src2)>;
-}
-
-let Predicates = [HasAVX] in {
- def : Pat<(X86pshufb VR128:$src, VR128:$mask),
- (VPSHUFBrr128 VR128:$src, VR128:$mask)>;
- def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
- (VPSHUFBrm128 VR128:$src, addr:$mask)>;
-
- def : Pat<(v16i8 (X86psign VR128:$src1, VR128:$src2)),
- (VPSIGNBrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86psign VR128:$src1, VR128:$src2)),
- (VPSIGNWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86psign VR128:$src1, VR128:$src2)),
- (VPSIGNDrr128 VR128:$src1, VR128:$src2)>;
-
- def : Pat<(v8i16 (X86hadd VR128:$src1, VR128:$src2)),
- (VPHADDWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86hadd VR128:$src1, VR128:$src2)),
- (VPHADDDrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v8i16 (X86hsub VR128:$src1, VR128:$src2)),
- (VPHSUBWrr128 VR128:$src1, VR128:$src2)>;
- def : Pat<(v4i32 (X86hsub VR128:$src1, VR128:$src2)),
- (VPHSUBDrr128 VR128:$src1, VR128:$src2)>;
-}
-
-let Predicates = [HasAVX2] in {
- def : Pat<(v32i8 (X86psign VR256:$src1, VR256:$src2)),
- (VPSIGNBrr256 VR256:$src1, VR256:$src2)>;
- def : Pat<(v16i16 (X86psign VR256:$src1, VR256:$src2)),
- (VPSIGNWrr256 VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86psign VR256:$src1, VR256:$src2)),
- (VPSIGNDrr256 VR256:$src1, VR256:$src2)>;
-
- def : Pat<(v16i16 (X86hadd VR256:$src1, VR256:$src2)),
- (VPHADDWrr256 VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86hadd VR256:$src1, VR256:$src2)),
- (VPHADDDrr256 VR256:$src1, VR256:$src2)>;
- def : Pat<(v16i16 (X86hsub VR256:$src1, VR256:$src2)),
- (VPHSUBWrr256 VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86hsub VR256:$src1, VR256:$src2)),
- (VPHSUBDrr256 VR256:$src1, VR256:$src2)>;
+ int_x86_ssse3_pmul_hr_sw_128,
+ SSE_PMULHRSW>;
}
//===---------------------------------------------------------------------===//
@@ -5389,7 +5502,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, OpSize;
+ [], IIC_SSE_PALIGNR>, OpSize;
let mayLoad = 1 in
def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5397,7 +5510,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, OpSize;
+ [], IIC_SSE_PALIGNR>, OpSize;
}
}
@@ -5424,15 +5537,15 @@ let Predicates = [HasAVX2] in
let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
defm PALIGN : ssse3_palign<"palignr">;
-let Predicates = [HasSSSE3] in {
-def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}
let Predicates = [HasAVX] in {
@@ -5446,23 +5559,36 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
+let Predicates = [HasSSSE3] in {
+def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
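Every X86PAlign pattern here and in the AVX/AVX2 blocks above hands the sources to the instruction in reverse order; the reason (a reading of the instruction's semantics, not text from the patch):

// palignr xmm1, xmm2, imm8 computes lowbytes((xmm1 ## xmm2) >> (imm8*8)),
// i.e. the first (tied) register operand supplies the HIGH half of the
// concatenation. The X86PAlign node orders its operands the other way
// round, so each Pat passes ($src2, $src1) to PALIGNR128rr/VPALIGNR*rr.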
//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
- [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>;
+ [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+ Requires<[HasSSE3]>;
def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2),
- [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>;
+ [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>,
+ Requires<[HasSSE3]>;
}
let Uses = [EAX, ECX, EDX] in
-def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
- Requires<[HasSSE3]>;
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+ TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
-def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB,
- Requires<[HasSSE3]>;
+def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", [], IIC_SSE_MWAIT>,
+ TB, Requires<[HasSSE3]>;
def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
@@ -5536,70 +5662,80 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
-let Predicates = [HasSSE41] in {
+let Predicates = [HasAVX] in {
// Common patterns involving scalar load.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
- (PMOVSXBWrm addr:$src)>;
+ (VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
- (PMOVSXBWrm addr:$src)>;
+ (VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
- (PMOVSXWDrm addr:$src)>;
+ (VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
- (PMOVSXWDrm addr:$src)>;
+ (VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
- (PMOVSXDQrm addr:$src)>;
+ (VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
- (PMOVSXDQrm addr:$src)>;
+ (VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
- (PMOVZXBWrm addr:$src)>;
+ (VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
- (PMOVZXBWrm addr:$src)>;
+ (VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
- (PMOVZXWDrm addr:$src)>;
+ (VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
- (PMOVZXWDrm addr:$src)>;
+ (VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
- (PMOVZXDQrm addr:$src)>;
+ (VPMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
- (PMOVZXDQrm addr:$src)>;
+ (VPMOVZXDQrm addr:$src)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasSSE41] in {
// Common patterns involving scalar load.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
- (VPMOVSXBWrm addr:$src)>;
+ (PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
- (VPMOVSXBWrm addr:$src)>;
+ (PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
- (VPMOVSXWDrm addr:$src)>;
+ (PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
- (VPMOVSXWDrm addr:$src)>;
+ (PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
- (VPMOVSXDQrm addr:$src)>;
+ (PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
- (VPMOVSXDQrm addr:$src)>;
+ (PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
- (VPMOVZXBWrm addr:$src)>;
+ (PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
- (VPMOVZXBWrm addr:$src)>;
+ (PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
- (VPMOVZXWDrm addr:$src)>;
+ (PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
- (VPMOVZXWDrm addr:$src)>;
+ (PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
- (VPMOVZXDQrm addr:$src)>;
+ (PMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
- (VPMOVZXDQrm addr:$src)>;
+ (PMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+}
+
+let Predicates = [HasSSE41] in {
+def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
@@ -5655,30 +5791,30 @@ defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
-let Predicates = [HasSSE41] in {
+let Predicates = [HasAVX] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
- (PMOVSXBDrm addr:$src)>;
+ (VPMOVSXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
- (PMOVSXWQrm addr:$src)>;
+ (VPMOVSXWQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
- (PMOVZXBDrm addr:$src)>;
+ (VPMOVZXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
- (PMOVZXWQrm addr:$src)>;
+ (VPMOVZXWQrm addr:$src)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
- (VPMOVSXBDrm addr:$src)>;
+ (PMOVSXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
- (VPMOVSXWQrm addr:$src)>;
+ (PMOVSXWQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
- (VPMOVZXBDrm addr:$src)>;
+ (PMOVZXBDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
- (VPMOVZXWQrm addr:$src)>;
+ (PMOVZXWQrm addr:$src)>;
}
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
@@ -5723,30 +5859,30 @@ defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
-let Predicates = [HasSSE41] in {
+let Predicates = [HasAVX] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (PMOVSXBQrm addr:$src)>;
+ (VPMOVSXBQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (PMOVZXBQrm addr:$src)>;
+ (VPMOVZXBQrm addr:$src)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (VPMOVSXBQrm addr:$src)>;
+ (PMOVSXBQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbq
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (VPMOVZXBQrm addr:$src)>;
+ (PMOVZXBQrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -5876,13 +6012,13 @@ let ExeDomain = SSEPackedSingle in {
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
imm:$src2))),
addr:$dst),
- (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasSSE41]>;
+ (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
imm:$src2))),
addr:$dst),
- (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasAVX]>;
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasSSE41]>;
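This hunk, like the pmovsx/pmovzx and INSERTPS reorderings nearby, makes one mechanical change: the duplicated pattern is reordered so the HasAVX copy precedes the HasSSE41 copy. Why order matters, as far as one can tell from the predicate setup:

// Both predicates hold on an AVX subtarget (AVX sits above SSE4.1 in the
// subtarget's SSE level), and for patterns of equal complexity the
// generated selector tries them in source order. With the VEX form first,
// AVX targets pick VEXTRACTPSmr; plain SSE4.1 targets fail the HasAVX
// check and fall through to EXTRACTPSmr.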
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
@@ -5992,19 +6128,12 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let ExeDomain = SSEPackedSingle in {
- let Constraints = "$src1 = $dst" in
- defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
let Predicates = [HasAVX] in
defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ let Constraints = "$src1 = $dst" in
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
}
-def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
- (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>,
- Requires<[HasAVX]>;
-def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
- (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>,
- Requires<[HasSSE41]>;
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
@@ -6347,8 +6476,6 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
0>, VEX_4V;
- defm VPCMPEQQ : SS41I_binop_rm_int<0x29, "vpcmpeqq", int_x86_sse41_pcmpeqq,
- 0>, VEX_4V;
defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb,
0>, VEX_4V;
defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd,
@@ -6367,19 +6494,12 @@ let Predicates = [HasAVX] in {
0>, VEX_4V;
defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
0>, VEX_4V;
-
- def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
- (VPCMPEQQrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
- (VPCMPEQQrm VR128:$src1, addr:$src2)>;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
int_x86_avx2_packusdw>, VEX_4V;
- defm VPCMPEQQ : SS41I_binop_rm_int_y<0x29, "vpcmpeqq",
- int_x86_avx2_pcmpeq_q>, VEX_4V;
defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb",
int_x86_avx2_pmins_b>, VEX_4V;
defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd",
@@ -6398,17 +6518,11 @@ let Predicates = [HasAVX2] in {
int_x86_avx2_pmaxu_w>, VEX_4V;
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
int_x86_avx2_pmul_dq>, VEX_4V;
-
- def : Pat<(v4i64 (X86pcmpeqq VR256:$src1, VR256:$src2)),
- (VPCMPEQQYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86pcmpeqq VR256:$src1, (memop addr:$src2))),
- (VPCMPEQQYrm VR256:$src1, addr:$src2)>;
}
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
- defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq", int_x86_sse41_pcmpeqq>;
defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>;
defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>;
defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>;
@@ -6420,57 +6534,46 @@ let Constraints = "$src1 = $dst" in {
defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>;
}
-let Predicates = [HasSSE41] in {
- def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
- (PCMPEQQrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
- (PCMPEQQrm VR128:$src1, addr:$src2)>;
-}
-
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, bit Is2Addr = 1> {
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
let isCommutable = 1 in
- def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
+ def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
- OpSize;
- def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
+ def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (OpNode VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2))))]>,
- OpSize;
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
}
-/// SS48I_binop_rm - Simple SSE41 binary operator.
-multiclass SS48I_binop_rm_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT> {
- let isCommutable = 1 in
- def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (OpVT (OpNode VR256:$src1, VR256:$src2)))]>,
- OpSize;
- def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (OpNode VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2))))]>,
- OpSize;
+let Predicates = [HasAVX] in {
+ defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+ defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V;
+ defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V;
}
-let Predicates = [HasAVX] in
- defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V;
-let Predicates = [HasAVX2] in
- defm VPMULLD : SS48I_binop_rm_y<0x40, "vpmulld", mul, v8i32>, VEX_4V;
-let Constraints = "$src1 = $dst" in
- defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>;
+let Constraints = "$src1 = $dst" in {
+ defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+ memopv2i64, i128mem>;
+}
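// Roughly, each defm above expands into one register and one memory form,
// with record names formed from the defm name plus the inner def suffix.
// A simplified sketch for the AVX case (Is2Addr = 0 picks the three-operand
// asm string; the VEX_4V and OpSize adornments are omitted here):
//
//   def VPCMPEQQrr : SS48I<0x29, MRMSrcReg, (outs VR128:$dst),
//                      (ins VR128:$src1, VR128:$src2),
//                      "vpcmpeqq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
//                      [(set VR128:$dst,
//                            (v2i64 (X86pcmpeq VR128:$src1, VR128:$src2)))]>;
//   def VPCMPEQQrm : // as above, folding (bitconvert (memopv2i64 addr:$src2))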
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
@@ -6568,7 +6671,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -6577,7 +6680,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
RC:$src3))],
- SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
let Predicates = [HasAVX] in {
@@ -6705,69 +6808,37 @@ def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//
-/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
-multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, bit Is2Addr = 1> {
- def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr = 1> {
+ def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
OpSize;
- def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2),
+ def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
}
-/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
-multiclass SS42I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId256> {
- def Yrr : SS428I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
- OpSize;
- def Yrm : SS428I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (IntId256 VR256:$src1, (memopv4i64 addr:$src2)))]>, OpSize;
-}
-
-let Predicates = [HasAVX] in {
- defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
- 0>, VEX_4V;
-
- def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
- (VPCMPGTQrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
- (VPCMPGTQrm VR128:$src1, addr:$src2)>;
-}
-
-let Predicates = [HasAVX2] in {
- defm VPCMPGTQ : SS42I_binop_rm_int_y<0x37, "vpcmpgtq", int_x86_avx2_pcmpgt_q>,
- VEX_4V;
+let Predicates = [HasAVX] in
+ defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
- def : Pat<(v4i64 (X86pcmpgtq VR256:$src1, VR256:$src2)),
- (VPCMPGTQYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86pcmpgtq VR256:$src1, (memop addr:$src2))),
- (VPCMPGTQYrm VR256:$src1, addr:$src2)>;
-}
+let Predicates = [HasAVX2] in
+ defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
- defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
-
-let Predicates = [HasSSE42] in {
- def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
- (PCMPGTQrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
- (PCMPGTQrm VR128:$src1, addr:$src2)>;
-}
+ defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+ memopv2i64, i128mem>;
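// SS42I_binop_rm mirrors SS48I_binop_rm above, except that its memory
// pattern has no bitconvert: pcmpgtq's value type (v2i64/v4i64) is exactly
// what memopv2i64/memopv4i64 produce, so no cast between types is needed.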
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
@@ -6786,8 +6857,9 @@ multiclass pseudo_pcmpistrm<string asm> {
}
let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ let AddedComplexity = 1 in
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
}
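// The AddedComplexity = 1 above presumably exists so that instruction
// selection prefers the VEX pseudo over the otherwise-identical SSE form
// when both HasAVX and HasSSE42 hold.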
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
@@ -6823,8 +6895,9 @@ multiclass pseudo_pcmpestrm<string asm> {
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ let AddedComplexity = 1 in
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
}
let Predicates = [HasAVX],
@@ -7017,8 +7090,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
- (IntId128 VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+ (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
}
// Perform One Round of an AES Encryption/Decryption Flow
@@ -7044,44 +7116,6 @@ let Constraints = "$src1 = $dst" in {
int_x86_aesni_aesdeclast>;
}
-let Predicates = [HasAES] in {
- def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
- (AESENCrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
- (AESENCrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
- (AESENCLASTrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
- (AESENCLASTrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
- (AESDECrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
- (AESDECrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
- (AESDECLASTrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
- (AESDECLASTrm VR128:$src1, addr:$src2)>;
-}
-
-let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in {
- def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
- (VAESENCrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
- (VAESENCrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
- (VAESENCLASTrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
- (VAESENCLASTrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
- (VAESDECrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
- (VAESDECrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
- (VAESDECLASTrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
- (VAESDECLASTrm VR128:$src1, addr:$src2)>;
-}
-
// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7093,8 +7127,7 @@ let Predicates = [HasAVX, HasAES] in {
def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
- [(set VR128:$dst,
- (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
OpSize, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7106,8 +7139,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"aesimc\t{$src1, $dst|$dst, $src1}",
- [(set VR128:$dst,
- (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
+ [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
OpSize;
// AES Round Key Generation Assist
@@ -7122,8 +7154,7 @@ let Predicates = [HasAVX, HasAES] in {
(ins i128mem:$src1, i8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
- imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
OpSize, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7136,8 +7167,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, i8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
- imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
OpSize;
//===----------------------------------------------------------------------===//
@@ -7146,31 +7176,31 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
// Carry-less multiplication instructions
let neverHasSideEffects = 1 in {
-let Constraints = "$src1 = $dst" in {
-def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+// AVX carry-less multiplication instructions
+def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[]>;
let mayLoad = 1 in
-def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[]>;
-}
-// AVX carry-less Multiplication instructions
-def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+let Constraints = "$src1 = $dst" in {
+def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[]>;
let mayLoad = 1 in
-def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[]>;
-}
+} // Constraints = "$src1 = $dst"
+} // neverHasSideEffects = 1
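// For reference (per the Intel SDM): pclmulqdq's imm8 selects the quadwords
// to multiply: bit 0 picks the low or high qword of $src1, and bit 4 the low
// or high qword of $src2. The pclmul_alias multiclass below presumably
// provides mnemonic aliases for those combinations.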
multiclass pclmul_alias<string asm, int immop> {
@@ -7242,6 +7272,7 @@ let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
int_x86_avx2_vbroadcasti128>;
+let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
(VBROADCASTF128 addr:$src)>;
@@ -7261,13 +7292,6 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, VEX_4V;
}
-def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-
//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
@@ -7283,12 +7307,14 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
[]>, VEX;
}
+let Predicates = [HasAVX] in {
def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
(VEXTRACTF128rr VR256:$src1, imm:$src2)>;
def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
(VEXTRACTF128rr VR256:$src1, imm:$src2)>;
def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
(VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+}
//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
@@ -7334,8 +7360,8 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
//
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
- X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag,
- Intrinsic IntVar, Intrinsic IntImm> {
+ X86MemOperand x86memop_i, PatFrag i_frag,
+ Intrinsic IntVar, ValueType vt> {
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7349,83 +7375,91 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (IntImm RC:$src1, imm:$src2))]>, VEX;
+ [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX;
+ [(set RC:$dst,
+ (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX;
}
let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
- memopv4f32, memopv2i64,
- int_x86_avx_vpermilvar_ps,
- int_x86_avx_vpermil_ps>;
+ memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- memopv8f32, memopv4i64,
- int_x86_avx_vpermilvar_ps_256,
- int_x86_avx_vpermil_ps_256>;
+ memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
- memopv2f64, memopv2i64,
- int_x86_avx_vpermilvar_pd,
- int_x86_avx_vpermil_pd>;
+ memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- memopv4f64, memopv4i64,
- int_x86_avx_vpermilvar_pd_256,
- int_x86_avx_vpermil_pd_256>;
+ memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>;
}
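// Note the split in avx_permil: the variable-mask forms (rr/rm) still match
// the per-width IntVar intrinsic, while the immediate forms (ri/mi) are now
// selected through the X86VPermilp target node using the vt parameter.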
-def : Pat<(v8f32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
- (VPERMILPSYri VR256:$src1, imm:$imm)>;
-def : Pat<(v4f64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
- (VPERMILPDYri VR256:$src1, imm:$imm)>;
+let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
(VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8f32 (X86VPermilp (memopv8f32 addr:$src1), (i8 imm:$imm))),
- (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4f64 (X86VPermilp (memopv4f64 addr:$src1), (i8 imm:$imm))),
- (VPERMILPDYmi addr:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
(i8 imm:$imm))),
(VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
(VPERMILPDYmi addr:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
+ (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+ (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
-let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
-let mayLoad = 1 in
+ [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V;
}
-def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, VR256:$src2, imm:$src3),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(int_x86_avx_vperm2f128_ps_256
- VR256:$src1, (memopv8f32 addr:$src2), imm:$src3),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vperm2f128_pd_256
- VR256:$src1, (memopv4f64 addr:$src2), imm:$src3),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vperm2f128_si_256
- VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), imm:$src3),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
+ (memopv8f32 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+ (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+ (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+ (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+ (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
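// vperm2f128's imm8, per the Intel SDM: bits [1:0] choose which 128-bit half
// feeds the result's low lane (0/1 = $src1 low/high, 2/3 = $src2 low/high),
// bits [5:4] do the same for the high lane, and bits 3 and 7 instead zero
// the corresponding lane.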
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
@@ -7564,6 +7598,7 @@ let Predicates = [HasAVX2] in {
}
// AVX1 broadcast patterns
+let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
@@ -7577,6 +7612,7 @@ def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
+}
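// AVX1 provides no integer broadcast instructions (vpbroadcastd/q arrive
// with AVX2), so the integer-typed load broadcasts above reuse
// vbroadcastss/vbroadcastsd, which perform the same load-and-splat bitwise.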
//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
@@ -7626,25 +7662,22 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>,
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
+let AddedComplexity = 1 in {
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst,
- (int_x86_avx2_vperm2i128 VR256:$src1, VR256:$src2, imm:$src3))]>,
- VEX_4V;
+ [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ (i8 imm:$src3))))]>, VEX_4V;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst,
- (int_x86_avx2_vperm2i128 VR256:$src1, (memopv4i64 addr:$src2),
- imm:$src3))]>,
- VEX_4V;
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+ (i8 imm:$src3)))]>, VEX_4V;
+}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2], AddedComplexity = 1 in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7659,44 +7692,8 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
(i8 imm:$imm))),
(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
- (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
-// AVX1 patterns
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
- (memopv8f32 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
- (memopv4i64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
- (memopv4f64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
- (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
- (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-
//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
@@ -7734,6 +7731,7 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
}
// AVX1 patterns
+let Predicates = [HasAVX] in {
def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
(i32 imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
@@ -7758,6 +7756,7 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(i32 imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
@@ -7793,6 +7792,7 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
}
// AVX1 patterns
+let Predicates = [HasAVX] in {
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
@@ -7817,6 +7817,7 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
(v16i8 (VEXTRACTF128rr
(v32i8 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+}
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 0000000..757dcd0
--- /dev/null
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+ "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+ "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+ "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+ "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+
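// All of the SVM instructions above share the two-byte 0F 01 opcode and are
// distinguished only by a fixed ModRM byte (MRM_D8..MRM_DF); the 32/64-bit
// pairs exist because the width of the implicit address register (EAX vs.
// RAX) follows the current mode.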
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 58cf6e3..bdeb63f 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -1,10 +1,10 @@
-//===- X86InstrShiftRotate.td - Shift and Rotate Instrs ----*- tablegen -*-===//
-//
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the shift and rotate instructions.
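// IIC_SR below names the instruction-itinerary class for shifts and rotates;
// it is threaded through each def as the trailing template argument so that
// scheduling models can attach per-class latency information.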
@@ -19,44 +19,46 @@ let Constraints = "$src1 = $dst" in {
let Uses = [CL] in {
def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
"shl{b}\t{%cl, $dst|$dst, CL}",
- [(set GR8:$dst, (shl GR8:$src1, CL))]>;
+ [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
"shl{w}\t{%cl, $dst|$dst, CL}",
- [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize;
+ [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
"shl{l}\t{%cl, $dst|$dst, CL}",
- [(set GR32:$dst, (shl GR32:$src1, CL))]>;
+ [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t{%cl, $dst|$dst, %CL}",
- [(set GR64:$dst, (shl GR64:$src1, CL))]>;
+ "shl{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
} // Uses = [CL]
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize;
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
// NOTE: We don't include patterns for shifts of a register by one, because
// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
- "shl{b}\t$dst", []>;
+ "shl{b}\t$dst", [], IIC_SR>;
def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t$dst", []>, OpSize;
+ "shl{w}\t$dst", [], IIC_SR>, OpSize;
def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t$dst", []>;
+ "shl{l}\t$dst", [], IIC_SR>;
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t$dst", []>;
+ "shl{q}\t$dst", [], IIC_SR>;
} // isConvertibleToThreeAddress = 1
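// (That fold lives elsewhere as a Pat; e.g. (shl GR32:$x, (i8 1)) selects
// to ADD32rr $x, $x. The SHL*r1 defs above chiefly serve the assembler and
// disassembler.)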
} // Constraints = "$src1 = $dst"
@@ -66,223 +68,266 @@ def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
let Uses = [CL] in {
def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t{%cl, $dst|$dst, CL}",
- [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t{%cl, $dst|$dst, CL}",
- [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t{%cl, $dst|$dst, CL}",
- [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
- "shl{q}\t{%cl, $dst|$dst, %CL}",
- [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>;
+ "shl{q}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
"shl{b}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
"shl{w}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
OpSize;
def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
"shl{l}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
- [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
// Shift by 1
def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
"shl{b}\t$dst",
- [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t$dst",
- [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
OpSize;
def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t$dst",
- [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
- [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
let Constraints = "$src1 = $dst" in {
let Uses = [CL] in {
def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
"shr{b}\t{%cl, $dst|$dst, CL}",
- [(set GR8:$dst, (srl GR8:$src1, CL))]>;
+ [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t{%cl, $dst|$dst, CL}",
- [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize;
+ [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t{%cl, $dst|$dst, CL}",
- [(set GR32:$dst, (srl GR32:$src1, CL))]>;
+ [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
- "shr{q}\t{%cl, $dst|$dst, %CL}",
- [(set GR64:$dst, (srl GR64:$src1, CL))]>;
+ "shr{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
}
def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
"shr{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shr{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>, OpSize;
def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shr{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
"shr{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
// Shift right by 1
def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
"shr{b}\t$dst",
- [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t$dst",
- [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize;
def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t$dst",
- [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>;
def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t$dst",
- [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
} // Constraints = "$src1 = $dst"
let Uses = [CL] in {
def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t{%cl, $dst|$dst, CL}",
- [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t{%cl, $dst|$dst, CL}",
- [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
OpSize;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t{%cl, $dst|$dst, CL}",
- [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
- "shr{q}\t{%cl, $dst|$dst, %CL}",
- [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>;
+ "shr{q}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
"shr{b}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
"shr{w}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
OpSize;
def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
"shr{l}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
- [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
// Shift by 1
def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
"shr{b}\t$dst",
- [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t$dst",
- [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>, OpSize;
def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t$dst",
- [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
- [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
let Constraints = "$src1 = $dst" in {
let Uses = [CL] in {
def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t{%cl, $dst|$dst, CL}",
- [(set GR8:$dst, (sra GR8:$src1, CL))]>;
+ [(set GR8:$dst, (sra GR8:$src1, CL))],
+ IIC_SR>;
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t{%cl, $dst|$dst, CL}",
- [(set GR16:$dst, (sra GR16:$src1, CL))]>, OpSize;
+ [(set GR16:$dst, (sra GR16:$src1, CL))],
+ IIC_SR>, OpSize;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t{%cl, $dst|$dst, CL}",
- [(set GR32:$dst, (sra GR32:$src1, CL))]>;
+ [(set GR32:$dst, (sra GR32:$src1, CL))],
+ IIC_SR>;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
- "sar{q}\t{%cl, $dst|$dst, %CL}",
- [(set GR64:$dst, (sra GR64:$src1, CL))]>;
+ "sar{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (sra GR64:$src1, CL))],
+ IIC_SR>;
}
def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"sar{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"sar{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
OpSize;
def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"sar{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"sar{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
// Shift by 1
def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
"sar{b}\t$dst",
- [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+ IIC_SR>;
def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t$dst",
- [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t$dst",
- [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+ IIC_SR>;
def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t$dst",
- [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+ IIC_SR>;
} // Constraints = "$src1 = $dst"
let Uses = [CL] in {
def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t{%cl, $dst|$dst, CL}",
- [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t{%cl, $dst|$dst, CL}",
- [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t{%cl, $dst|$dst, CL}",
- [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>;
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
- "sar{q}\t{%cl, $dst|$dst, %CL}",
- [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>;
+ "sar{q}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
}
def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
"sar{b}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
"sar{w}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
OpSize;
def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
"sar{l}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
- [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
// Shift by 1
def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
"sar{b}\t$dst",
- [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t$dst",
- [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
OpSize;
def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t$dst",
- [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
- [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
//===----------------------------------------------------------------------===//
// Rotate instructions
@@ -290,125 +335,125 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
let Constraints = "$src1 = $dst" in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t$dst", []>;
+ "rcl{b}\t$dst", [], IIC_SR>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t$dst", []>, OpSize;
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t$dst", []>;
+ "rcl{l}\t$dst", [], IIC_SR>;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t$dst", []>;
+ "rcl{q}\t$dst", [], IIC_SR>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t$dst", []>;
+ "rcr{b}\t$dst", [], IIC_SR>;
def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t$dst", []>, OpSize;
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t$dst", []>;
+ "rcr{l}\t$dst", [], IIC_SR>;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t$dst", []>;
+ "rcr{q}\t$dst", [], IIC_SR>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
} // Constraints = "$src1 = $dst"
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t$dst", []>;
+ "rcl{b}\t$dst", [], IIC_SR>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
- "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t$dst", []>, OpSize;
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize;
def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t$dst", []>;
+ "rcl{l}\t$dst", [], IIC_SR>;
def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t$dst", []>;
+ "rcl{q}\t$dst", [], IIC_SR>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t$dst", []>;
+ "rcr{b}\t$dst", [], IIC_SR>;
def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
- "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t$dst", []>, OpSize;
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize;
def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t$dst", []>;
+ "rcr{l}\t$dst", [], IIC_SR>;
def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t$dst", []>;
+ "rcr{q}\t$dst", [], IIC_SR>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+ "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+ "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
}
let Constraints = "$src1 = $dst" in {
@@ -416,179 +461,217 @@ let Constraints = "$src1 = $dst" in {
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t{%cl, $dst|$dst, CL}",
- [(set GR8:$dst, (rotl GR8:$src1, CL))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t{%cl, $dst|$dst, CL}",
- [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize;
+ [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t{%cl, $dst|$dst, CL}",
- [(set GR32:$dst, (rotl GR32:$src1, CL))]>;
+ [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
- "rol{q}\t{%cl, $dst|$dst, %CL}",
- [(set GR64:$dst, (rotl GR64:$src1, CL))]>;
+ "rol{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
}
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>,
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
OpSize;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
"rol{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+ IIC_SR>;
def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+ IIC_SR>;
def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+ IIC_SR>;
} // Constraints = "$src = $dst"
let Uses = [CL] in {
def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t{%cl, $dst|$dst, CL}",
- [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t{%cl, $dst|$dst, CL}",
- [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t{%cl, $dst|$dst, CL}",
- [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
- "rol{q}\t{%cl, $dst|$dst, %CL}",
- [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>;
+                  "rol{q}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
}
def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
"rol{b}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
"rol{w}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>,
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>,
OpSize;
def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
"rol{l}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
- [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+ IIC_SR>;
// Rotate by 1
def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
"rol{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
OpSize;
def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
let Constraints = "$src1 = $dst" in {
let Uses = [CL] in {
def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t{%cl, $dst|$dst, CL}",
- [(set GR8:$dst, (rotr GR8:$src1, CL))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t{%cl, $dst|$dst, CL}",
- [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize;
+ [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t{%cl, $dst|$dst, CL}",
- [(set GR32:$dst, (rotr GR32:$src1, CL))]>;
+ [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
- "ror{q}\t{%cl, $dst|$dst, %CL}",
- [(set GR64:$dst, (rotr GR64:$src1, CL))]>;
+ "ror{q}\t{%cl, $dst|$dst, CL}",
+ [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
}
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>,
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+ IIC_SR>,
OpSize;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+ IIC_SR>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+ IIC_SR>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+ IIC_SR>, OpSize;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+ IIC_SR>;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+ IIC_SR>;
} // Constraints = "$src = $dst"
let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t{%cl, $dst|$dst, CL}",
- [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t{%cl, $dst|$dst, CL}",
- [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+ IIC_SR>, OpSize;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t{%cl, $dst|$dst, CL}",
- [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>;
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
- "ror{q}\t{%cl, $dst|$dst, %CL}",
- [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>;
+ "ror{q}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+ IIC_SR>;
}
def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
"ror{b}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
"ror{w}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>,
OpSize;
def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
"ror{l}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
- [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+ IIC_SR>;
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>,
OpSize;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ IIC_SR>;
//===----------------------------------------------------------------------===//
@@ -601,30 +684,36 @@ let Uses = [CL] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
TB, OpSize;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+ IIC_SHD16_REG_CL>,
TB, OpSize;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB;
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB;
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+ IIC_SHD32_REG_CL>, TB;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+ IIC_SHD64_REG_CL>,
TB;
}
@@ -634,42 +723,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(ins GR16:$src1, GR16:$src2, i8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
TB, OpSize;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, i8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
TB, OpSize;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, i8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
TB;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, i8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
TB;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, i8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, i8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
- (i8 imm:$src3)))]>,
+ (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
TB;
}
} // Constraints = "$src = $dst"
@@ -678,68 +767,74 @@ let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize;
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize;
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)]>, TB;
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)]>, TB;
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ addr:$dst)], IIC_SHD64_MEM_CL>, TB;
}
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
TB, OpSize;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD16_MEM_IM>,
TB, OpSize;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
TB;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD32_MEM_IM>,
TB;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ (i8 imm:$src3)), addr:$dst)],
+ IIC_SHD64_MEM_IM>,
TB;
} // Defs = [EFLAGS]
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 1b43838..8843848 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -1,10 +1,10 @@
-//===- X86InstrSystem.td - System Instructions -------------*- tablegen -*-===//
-//
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 instructions that are generally used in
@@ -214,18 +214,18 @@ def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t{$dst}", []>, TB, OpSize;
+ "str{w}\t$dst", []>, TB, OpSize;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t{$dst}", []>, TB;
+ "str{l}\t$dst", []>, TB;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
- "str{q}\t{$dst}", []>, TB;
+ "str{q}\t$dst", []>, TB;
def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
- "str{w}\t{$dst}", []>, TB;
+ "str{w}\t$dst", []>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
- "ltr{w}\t{$src}", []>, TB;
+ "ltr{w}\t$src", []>, TB;
def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
- "ltr{w}\t{$src}", []>, TB;
+ "ltr{w}\t$src", []>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
"push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize;
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 74477cd..6a8f0c8 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -1,10 +1,10 @@
-//===- X86InstrVMX.td - VMX Instruction Set Extension ------*- tablegen -*-===//
-//
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the instructions that make up the Intel VMX instruction
@@ -33,6 +33,8 @@ def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, OpSize, TB;
+// 0F 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
// 0F 01 C2
def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
// 0F 01 C3
@@ -60,5 +62,5 @@ def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
- "vmxon\t{$vmxon}", []>, XS;
+ "vmxon\t$vmxon", []>, XS;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 64cc44d..65bbcb5 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -1,4 +1,4 @@
-//====- X86InstrXOP.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,90 +11,123 @@
//
//===----------------------------------------------------------------------===//
-multiclass xop2op<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> {
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, VEX;
- def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, VEX;
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
let isAsmParserOnly = 1 in {
- defm VPHSUBWD : xop2op<0xE2, "vphsubwd", f128mem>;
- defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", f128mem>;
- defm VPHSUBBW : xop2op<0xE1, "vphsubbw", f128mem>;
- defm VPHADDWQ : xop2op<0xC7, "vphaddwq", f128mem>;
- defm VPHADDWD : xop2op<0xC6, "vphaddwd", f128mem>;
- defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", f128mem>;
- defm VPHADDUWD : xop2op<0xD6, "vphadduwd", f128mem>;
- defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", f128mem>;
- defm VPHADDUBW : xop2op<0xD1, "vphaddubw", f128mem>;
- defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", f128mem>;
- defm VPHADDUBD : xop2op<0xD2, "vphaddubd", f128mem>;
- defm VPHADDDQ : xop2op<0xCB, "vphadddq", f128mem>;
- defm VPHADDBW : xop2op<0xC1, "vphaddbw", f128mem>;
- defm VPHADDBQ : xop2op<0xC3, "vphaddbq", f128mem>;
- defm VPHADDBD : xop2op<0xC2, "vphaddbd", f128mem>;
- defm VFRCZSS : xop2op<0x82, "vfrczss", f32mem>;
- defm VFRCZSD : xop2op<0x83, "vfrczsd", f64mem>;
- defm VFRCZPS : xop2op<0x80, "vfrczps", f128mem>;
- defm VFRCZPD : xop2op<0x81, "vfrczpd", f128mem>;
-}
-
-multiclass xop2op256<bits<8> opc, string OpcodeStr> {
+ defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+ defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+ defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+ defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+ defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+ defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+ defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+ defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+ defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+ defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+ defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+ defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+ defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+ defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+ defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
+ defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+ defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+}
+
+// Scalar-load instructions with a two-address operand constraint
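+// (Tying $src1 to $dst lets the patterns express pass-through of the
+// untouched vector elements, while the sse_load_f32/f64 ComplexPatterns
+// allow a bare scalar load to be folded into $src2.)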
+let Constraints = "$src1 = $dst" in {
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1,
+ memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1,
+ (bitconvert mem_cpat:$src2)))]>, VEX;
+}
+
+} // Constraints = "$src1 = $dst"
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32>;
+ defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64>;
+}
+
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, VEX, VEX_L;
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, VEX;
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
let isAsmParserOnly = 1 in {
- defm VFRCZPS : xop2op256<0x80, "vfrczps">;
- defm VFRCZPD : xop2op256<0x81, "vfrczpd">;
+ defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
+ memopv8f32>;
+ defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
+ memopv4f64>;
}
-multiclass xop3op<bits<8> opc, string OpcodeStr> {
+multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX_4VOp3;
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX_4V, VEX_W;
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
+ VEX_4V, VEX_W;
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins f128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX_4VOp3;
+ [(set VR128:$dst,
+ (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
+ VEX_4VOp3;
}
let isAsmParserOnly = 1 in {
- defm VPSHLW : xop3op<0x95, "vpshlw">;
- defm VPSHLQ : xop3op<0x97, "vpshlq">;
- defm VPSHLD : xop3op<0x96, "vpshld">;
- defm VPSHLB : xop3op<0x94, "vpshlb">;
- defm VPSHAW : xop3op<0x99, "vpshaw">;
- defm VPSHAQ : xop3op<0x9B, "vpshaq">;
- defm VPSHAD : xop3op<0x9A, "vpshad">;
- defm VPSHAB : xop3op<0x98, "vpshab">;
- defm VPROTW : xop3op<0x91, "vprotw">;
- defm VPROTQ : xop3op<0x93, "vprotq">;
- defm VPROTD : xop3op<0x92, "vprotd">;
- defm VPROTB : xop3op<0x90, "vprotb">;
+ defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+ defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+ defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+ defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+ defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+ defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+ defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
}
multiclass xop3opimm<bits<8> opc, string OpcodeStr> {
- def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
- def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
+ let neverHasSideEffects = 1 in {
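+  // No selection patterns are attached to these forms, so mayLoad and
+  // neverHasSideEffects must be stated explicitly rather than inferred.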
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX;
+ let mayLoad = 1 in
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins f128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX;
+ }
}
let isAsmParserOnly = 1 in {
@@ -105,139 +138,170 @@ let isAsmParserOnly = 1 in {
}
// Instruction where second source can be memory, but third must be register
-multiclass xop4opm2<bits<8> opc, string OpcodeStr> {
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ VR128:$src3))]>, VEX_4V, VEX_I8IMM;
}
let isAsmParserOnly = 1 in {
- defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd">;
- defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd">;
- defm VPMACSWW : xop4opm2<0x95, "vpmacsww">;
- defm VPMACSWD : xop4opm2<0x96, "vpmacswd">;
- defm VPMACSSWW : xop4opm2<0x85, "vpmacssww">;
- defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd">;
- defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql">;
- defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh">;
- defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd">;
- defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql">;
- defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh">;
- defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd">;
+ defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+ defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+ defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+ defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+ defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+ defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+ defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+ defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+ defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+ defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+ defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+ defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
}
// Instruction where second source can be memory, third must be imm8
-multiclass xop4opimm<bits<8> opc, string OpcodeStr> {
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType VT> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V;
+ [(set VR128:$dst,
+ (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V;
+ [(set VR128:$dst,
+ (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ imm:$src3)))]>, VEX_4V;
}
let isAsmParserOnly = 1 in {
- defm VPCOMW : xop4opimm<0xCD, "vpcomw">;
- defm VPCOMUW : xop4opimm<0xED, "vpcomuw">;
- defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq">;
- defm VPCOMUD : xop4opimm<0xEE, "vpcomud">;
- defm VPCOMUB : xop4opimm<0xEC, "vpcomub">;
- defm VPCOMQ : xop4opimm<0xCF, "vpcomq">;
- defm VPCOMD : xop4opimm<0xCE, "vpcomd">;
- defm VPCOMB : xop4opimm<0xCC, "vpcomb">;
+ defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>;
+ defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>;
+ defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>;
+ defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>;
+ defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>;
+ defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>;
+ defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>;
+ defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>;
}
// Instruction where either second or third source can be memory
-multiclass xop4op<bits<8> opc, string OpcodeStr> {
+multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
+ VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM, XOP_W;
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2,
+ (bitconvert (memopv2i64 addr:$src3))))]>,
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR128:$dst,
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ VR128:$src3))]>,
+ VEX_4V, VEX_I8IMM;
}
let isAsmParserOnly = 1 in {
- defm VPPERM : xop4op<0xA3, "vpperm">;
- defm VPCMOV : xop4op<0xA2, "vpcmov">;
+ defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+ defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
}
-multiclass xop4op256<bits<8> opc, string OpcodeStr> {
+multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
+ VEX_4V, VEX_I8IMM;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM, XOP_W;
+ [(set VR256:$dst,
+ (Int VR256:$src1, VR256:$src2,
+ (bitconvert (memopv4i64 addr:$src3))))]>,
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_4V, VEX_I8IMM;
+ [(set VR256:$dst,
+ (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
+ VR256:$src3))]>,
+ VEX_4V, VEX_I8IMM;
}
let isAsmParserOnly = 1 in {
- defm VPCMOV : xop4op256<0xA2, "vpcmov">;
+ defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
}
-multiclass xop5op<bits<8> opc, string OpcodeStr> {
+multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
+ Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>;
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, XOP_W;
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4;
def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>;
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>;
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>;
def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, XOP_W;
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
+ VEX_W, MemOp4;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>;
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>;
}
-let isAsmParserOnly = 1 in {
- defm VPERMIL2PD : xop5op<0x49, "vpermil2pd">;
- defm VPERMIL2PS : xop5op<0x48, "vpermil2ps">;
-}
+defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+ int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>;
+defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+ int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>;
+
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 2145a33..0168d12 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -300,7 +300,10 @@ extern "C" {
SIZE(X86CompilationCallback_SSE)
);
# else
- void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+  // The following function is called only from this translation unit,
+  // unless we are under 64-bit Windows with MSC, where there is
+  // no support for inline assembly.
+ static void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
_declspec(naked) void X86CompilationCallback(void) {
__asm {
@@ -571,6 +574,5 @@ char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
return TLSOffset;
#else
llvm_unreachable("Cannot allocate thread local storage on this arch!");
- return 0;
#endif
}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index 238420c..c76d3cc 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -1,4 +1,4 @@
-//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9232196..a7a5c56 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -154,6 +154,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
Ctx),
Ctx);
break;
+ case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
@@ -230,7 +231,8 @@ static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
/// a short fixed-register form.
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
unsigned ImmOp = Inst.getNumOperands() - 1;
- assert(Inst.getOperand(0).isReg() && Inst.getOperand(ImmOp).isImm() &&
+ assert(Inst.getOperand(0).isReg() &&
+ (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
Inst.getNumOperands() == 2) && "Unexpected instruction!");
@@ -335,6 +337,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = LowerSymbolOperand(MO,
AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
break;
+ case MachineOperand::MO_RegisterMask:
+ // Ignore call clobbers.
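+      // Register-mask operands summarize call-clobbered registers and have
+      // no MCInst counterpart, so they are simply dropped when lowering.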
+ continue;
}
OutMI.addOperand(MCOp);
@@ -373,6 +378,7 @@ ReSimplify:
case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
+ case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;
case X86::MOV16r0:
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
@@ -383,14 +389,12 @@ ReSimplify:
LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
break;
- // TAILJMPr64, [WIN]CALL64r, [WIN]CALL64pcrel32 - These instructions have
- // register inputs modeled as normal uses instead of implicit uses. As such,
- // truncate off all but the first operand (the callee). FIXME: Change isel.
+ // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+ // inputs modeled as normal uses instead of implicit uses. As such, truncate
+ // off all but the first operand (the callee). FIXME: Change isel.
case X86::TAILJMPr64:
case X86::CALL64r:
- case X86::CALL64pcrel32:
- case X86::WINCALL64r:
- case X86::WINCALL64pcrel32: {
+ case X86::CALL64pcrel32: {
unsigned Opcode = OutMI.getOpcode();
MCOperand Saved = OutMI.getOperand(0);
OutMI = MCInst();
@@ -412,7 +416,7 @@ ReSimplify:
case X86::TAILJMPd64: {
unsigned Opcode;
switch (OutMI.getOpcode()) {
- default: assert(0 && "Invalid opcode");
+ default: llvm_unreachable("Invalid opcode");
case X86::TAILJMPr: Opcode = X86::JMP32r; break;
case X86::TAILJMPd:
case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
diff --git a/lib/Target/X86/X86MCInstLower.h b/lib/Target/X86/X86MCInstLower.h
index 0210072..40df3db 100644
--- a/lib/Target/X86/X86MCInstLower.h
+++ b/lib/Target/X86/X86MCInstLower.h
@@ -1,4 +1,4 @@
-//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//===-- X86MCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 0000000..568dc22
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index b0bb313..c747109 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -1,10 +1,10 @@
-//====- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===//
-//
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
-//
+//
//===----------------------------------------------------------------------===//
//
// This file declares X86-specific per-machine-function information.
@@ -21,6 +21,8 @@ namespace llvm {
/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo
/// and contains private X86 target-specific information for each
/// MachineFunction.
class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
/// ForceFramePointer - True if the function is required to use a frame
/// pointer for reasons other than it containing dynamic allocation or
/// that FP elimination is turned off. For example, Cygwin main function
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 4e80432..93e2744 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -127,121 +127,13 @@ const TargetRegisterClass *
X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
const TargetRegisterClass *B,
unsigned SubIdx) const {
- switch (SubIdx) {
- default: return 0;
- case X86::sub_8bit:
- if (B == &X86::GR8RegClass) {
- if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8)
- return A;
- } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
- A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass ||
- A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_ABCDRegClass;
- else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
- A == &X86::GR32_NOREXRegClass ||
- A == &X86::GR32_NOSPRegClass)
- return &X86::GR32_ABCDRegClass;
- else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
- A == &X86::GR16_NOREXRegClass)
- return &X86::GR16_ABCDRegClass;
- } else if (B == &X86::GR8_NOREXRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_NOREXRegClass;
- else if (A == &X86::GR64_ABCDRegClass)
- return &X86::GR64_ABCDRegClass;
- else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
- A == &X86::GR32_NOSPRegClass)
- return &X86::GR32_NOREXRegClass;
- else if (A == &X86::GR32_ABCDRegClass)
- return &X86::GR32_ABCDRegClass;
- else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass)
- return &X86::GR16_NOREXRegClass;
- else if (A == &X86::GR16_ABCDRegClass)
- return &X86::GR16_ABCDRegClass;
- }
- break;
- case X86::sub_8bit_hi:
- if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass))
- switch (A->getSize()) {
- case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass);
- case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass);
- case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass);
- default: return 0;
- }
- break;
- case X86::sub_16bit:
- if (B == &X86::GR16RegClass) {
- if (A->getSize() == 4 || A->getSize() == 8)
- return A;
- } else if (B == &X86::GR16_ABCDRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
- A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass ||
- A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_ABCDRegClass;
- else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
- A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
- return &X86::GR32_ABCDRegClass;
- } else if (B == &X86::GR16_NOREXRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_NOREXRegClass;
- else if (A == &X86::GR64_ABCDRegClass)
- return &X86::GR64_ABCDRegClass;
- else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
- A == &X86::GR32_NOSPRegClass)
- return &X86::GR32_NOREXRegClass;
- else if (A == &X86::GR32_ABCDRegClass)
- return &X86::GR64_ABCDRegClass;
- }
- break;
- case X86::sub_32bit:
- if (B == &X86::GR32RegClass) {
- if (A->getSize() == 8)
- return A;
- } else if (B == &X86::GR32_NOSPRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_NOSPRegClass)
- return &X86::GR64_NOSPRegClass;
- if (A->getSize() == 8)
- return getCommonSubClass(A, &X86::GR64_NOSPRegClass);
- } else if (B == &X86::GR32_ABCDRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
- A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass ||
- A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_ABCDRegClass;
- } else if (B == &X86::GR32_NOREXRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass)
- return &X86::GR64_NOREXRegClass;
- else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_NOREX_NOSPRegClass;
- else if (A == &X86::GR64_ABCDRegClass)
- return &X86::GR64_ABCDRegClass;
- } else if (B == &X86::GR32_NOREX_NOSPRegClass) {
- if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
- A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
- return &X86::GR64_NOREX_NOSPRegClass;
- else if (A == &X86::GR64_ABCDRegClass)
- return &X86::GR64_ABCDRegClass;
- }
- break;
- case X86::sub_ss:
- if (B == &X86::FR32RegClass)
- return A;
- break;
- case X86::sub_sd:
- if (B == &X86::FR64RegClass)
- return A;
- break;
- case X86::sub_xmm:
- if (B == &X86::VR128RegClass)
- return A;
- break;
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
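+  // (In 32-bit mode only the ABCD registers have an addressable low byte,
+  // since SIL/DIL and friends need a REX prefix; requiring a class that
+  // also has sub_8bit_hi restricts A to exactly those registers.)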
+ if (!Is64Bit && SubIdx == X86::sub_8bit) {
+ A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+ if (!A)
+ return 0;
}
- return 0;
+ return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
}
const TargetRegisterClass*
@@ -334,7 +226,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
-const unsigned *
+const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool callsEHReturn = false;
bool ghcCall = false;
@@ -345,45 +237,29 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
}
- static const unsigned GhcCalleeSavedRegs[] = {
- 0
- };
-
- static const unsigned CalleeSavedRegs32Bit[] = {
- X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
- };
-
- static const unsigned CalleeSavedRegs32EHRet[] = {
- X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
- };
-
- static const unsigned CalleeSavedRegs64Bit[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
-
- static const unsigned CalleeSavedRegs64EHRet[] = {
- X86::RAX, X86::RDX, X86::RBX, X86::R12,
- X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
-
- static const unsigned CalleeSavedRegsWin64[] = {
- X86::RBX, X86::RBP, X86::RDI, X86::RSI,
- X86::R12, X86::R13, X86::R14, X86::R15,
- X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9,
- X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13,
- X86::XMM14, X86::XMM15, 0
- };
-
- if (ghcCall) {
- return GhcCalleeSavedRegs;
- } else if (Is64Bit) {
+ if (ghcCall)
+ return CSR_Ghc_SaveList;
+ if (Is64Bit) {
if (IsWin64)
- return CalleeSavedRegsWin64;
- else
- return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit);
- } else {
- return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit);
+ return CSR_Win64_SaveList;
+ if (callsEHReturn)
+ return CSR_64EHRet_SaveList;
+ return CSR_64_SaveList;
}
+ if (callsEHReturn)
+ return CSR_32EHRet_SaveList;
+ return CSR_32_SaveList;
+}
+
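+// The CSR_* save lists and register masks used here are generated by
+// TableGen from the CalleeSavedRegs definitions in X86CallingConv.td,
+// replacing the hand-written arrays deleted above.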
+const uint32_t*
+X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ if (CC == CallingConv::GHC)
+ return CSR_Ghc_RegMask;
+ if (!Is64Bit)
+ return CSR_32_RegMask;
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
}
BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -428,16 +304,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned n = 0; n != 8; ++n) {
// R8, R9, ...
- const unsigned GPR64[] = {
+ static const uint16_t GPR64[] = {
X86::R8, X86::R9, X86::R10, X86::R11,
X86::R12, X86::R13, X86::R14, X86::R15
};
- for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI)
+ for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI)
Reserved.set(Reg);
// XMM8, XMM9, ...
assert(X86::XMM15 == X86::XMM8+7);
- for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
+ for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
++AI)
Reserved.set(Reg);
}
@@ -650,12 +526,10 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
unsigned X86RegisterInfo::getEHExceptionRegister() const {
llvm_unreachable("What is the exception register");
- return 0;
}
unsigned X86RegisterInfo::getEHHandlerRegister() const {
llvm_unreachable("What is the exception handler register");
- return 0;
}
namespace llvm {
@@ -837,8 +711,6 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
return X86::R15;
}
}
-
- return Reg;
}
}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 7d39c68..bee0393 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -1,4 +1,4 @@
-//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -95,7 +95,8 @@ public:
/// getCalleeSavedRegs - Return a null-terminated list of all of the
/// callee-save registers on this target.
- const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+ const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const;
/// getReservedRegs - Returns a bitset indexed by physical register number
/// indicating if a register is a special register that has particular uses and
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 9a7db36..5263a49 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -70,7 +70,7 @@ let Namespace = "X86" in {
def BH : Register<"bh">;
// 16-bit registers
- let SubRegIndices = [sub_8bit, sub_8bit_hi] in {
+ let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
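+  // CoveredBySubRegs marks these registers as fully covered by their two
+  // 8-bit halves, so defining both AL and AH (etc.) defines the whole
+  // 16-bit register.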
def AX : RegisterWithSubRegs<"ax", [AL,AH]>;
def DX : RegisterWithSubRegs<"dx", [DL,DH]>;
def CX : RegisterWithSubRegs<"cx", [CL,CH]>;
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
index 990962d..857becf 100644
--- a/lib/Target/X86/X86Relocations.h
+++ b/lib/Target/X86/X86Relocations.h
@@ -1,4 +1,4 @@
-//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
new file mode 100644
index 0000000..d6d0149
--- /dev/null
+++ b/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,262 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
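+//
+// These defs only name scheduling categories; a processor model binds them
+// to functional-unit stages. A minimal illustrative sketch (SomeCPU and
+// Port0 below are made-up names, not part of this patch):
+//
+//   def Port0 : FuncUnit;
+//   def SomeCPUItineraries : ProcessorItineraries<[Port0], [], [
+//     InstrItinData<IIC_SR, [InstrStage<1, [Port0]>]>
+//   ]>;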
+def IIC_DEFAULT : InstrItinClass;
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, rax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/adc/sbb/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+// sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+// zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR : InstrItinClass;
+def IIC_SSE_BIT_P_RM : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR : InstrItinClass;
+def IIC_SSE_INTALU_P_RM : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_CMPP_RR : InstrItinClass;
+def IIC_SSE_CMPP_RM : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTP_RR : InstrItinClass;
+def IIC_SSE_SQRTP_RM : InstrItinClass;
+def IIC_SSE_SQRTS_RR : InstrItinClass;
+def IIC_SSE_SQRTS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+def GenericItineraries : ProcessorItineraries<[], [], []>;
+
+include "X86ScheduleAtom.td"
+
+
+
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..e8cf72a
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,294 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom (Bonnell)
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA-32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+// Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+ // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+ // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+ [ Port0, Port1 ],
+ [], [
+ // P0 only
+ // InstrItinData<class, [InstrStage<N, [P0]>] >,
+ // P0 or P1
+ // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+ // P0 and P1
+ // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >,
+ //
+ // Default is 1 cycle, port0 or port1
+ InstrItinData<IIC_DEFAULT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+ // mul
+ InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul by al, ax, eax, rax
+ InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg by reg|mem
+ InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+ // imul reg = reg/mem * imm
+ InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+ // idiv
+ InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // div
+ InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+ // neg/not/inc/dec
+ InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+ // add/sub/and/or/xor/adc/sbb/cmp/test
+ InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+ // shift/rotate
+ InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+ // shift double
+ InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+ // cmov
+ InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // set
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+ // jcc
+ InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+ // jcxz/jecxz/jrcxz
+ InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+ // jmp rel
+ InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+ // jmp indirect
+ InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ // jmp far
+ InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+ // loop/loope/loopne
+ InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+ // call
+ InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+ InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+ // ret
+ InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+ InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+ // sign extension movs
+ InstrItinData<IIC_MOVSX, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+ // zero extension movs
+ InstrItinData<IIC_MOVZX, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+ // SSE binary operations
+ // arithmetic fp scalar
+ InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+ // arithmetic fp parallel
+ InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+ // bitwise parallel
+ InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+ // arithmetic int parallel
+ InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+ // multiply int parallel
+ InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+ // shift parallel
+ InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+ // conversions
+ // to/from PD ...
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // to/from PS except to/from PD and PS2PI
+ InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >
+]>;
+
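As a reading aid for the tables above: each InstrItinData binds an itinerary class to one or more InstrStage entries, and the scheduler derives an instruction's latency by summing the stage cycle counts. A rough sketch of that lookup path, assuming the MCInstrItineraries interface of this period (MCInstrDesc::getSchedClass() and InstrItineraryData::getStageLatency()); the wrapper function is illustrative only:

    // Illustrative wrapper: resolve one instruction's Atom latency from the
    // tables above. IIC_MUL64, for example, maps to a single 12-cycle stage
    // on [Port0, Port1], so this would return 12.
    static unsigned getAtomLatency(const InstrItineraryData &Itins,
                                   const MCInstrDesc &Desc) {
      if (Itins.isEmpty())
        return 1;                              // No itineraries: unit latency.
      unsigned Class = Desc.getSchedClass();   // Index of e.g. IIC_MUL64.
      return Itins.getStageLatency(Class);     // Sum of stage cycle counts.
    }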
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 6406bce..9a04e35 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -65,7 +65,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
std::pair<SDValue,SDValue> CallResult =
TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
false, false, false, false,
- 0, CallingConv::C, false, /*isReturnValueUsed=*/false,
+ 0, CallingConv::C, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
DAG, dl);
return CallResult.second;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 6e092c7..3eb9441 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -198,7 +198,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
// FIXME: AVX codegen support is not ready.
- //if ((ECX >> 28) & 1) { HasAVX = true; ToggleFeature(X86::FeatureAVX); }
+ //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
@@ -246,6 +246,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
IsBTMemSlow = true;
ToggleFeature(X86::FeatureSlowBTMem);
}
+
// If it's Nehalem, unaligned memory access is fast.
// FIXME: Nehalem is family 6. Also include Westmere and later processors?
if (Family == 15 && Model == 26) {
@@ -253,6 +254,12 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
ToggleFeature(X86::FeatureFastUAMem);
}
+ // Set processor type. Currently only Atom is detected.
+ if (Family == 6 && Model == 28) {
+ X86ProcFamily = IntelAtom;
+ ToggleFeature(X86::FeatureLeaForSP);
+ }
+
unsigned MaxExtLevel;
X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -266,15 +273,19 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasLZCNT = true;
ToggleFeature(X86::FeatureLZCNT);
}
- if (IsAMD && ((ECX >> 6) & 0x1)) {
- HasSSE4A = true;
- ToggleFeature(X86::FeatureSSE4A);
- }
- if (IsAMD && ((ECX >> 16) & 0x1)) {
- HasFMA4 = true;
- ToggleFeature(X86::FeatureFMA4);
- HasXOP = true;
- ToggleFeature(X86::FeatureXOP);
+ if (IsAMD) {
+ if ((ECX >> 6) & 0x1) {
+ HasSSE4A = true;
+ ToggleFeature(X86::FeatureSSE4A);
+ }
+ if ((ECX >> 11) & 0x1) {
+ HasXOP = true;
+ ToggleFeature(X86::FeatureXOP);
+ }
+ if ((ECX >> 16) & 0x1) {
+ HasFMA4 = true;
+ ToggleFeature(X86::FeatureFMA4);
+ }
}
}
}
@@ -291,7 +302,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
// FIXME: AVX2 codegen support is not ready.
//if ((EBX >> 5) & 0x1) {
- // HasAVX2 = true;
+ // X86SSELevel = AVX2;
// ToggleFeature(X86::FeatureAVX2);
//}
if ((EBX >> 8) & 0x1) {
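The magic numbers in this function follow CPUID directly: leaf 1 returns the feature flags in ECX (bit 19 = SSE4.1, bit 20 = SSE4.2, bit 28 = AVX) and the family/model fields in EAX used for the Atom check. A standalone sketch of the same tests using GCC's <cpuid.h> instead of the in-tree X86_MC::GetCpuIDAndInfo wrapper:

    #include <cpuid.h>

    // Sketch of the detection outside LLVM. Family 6, model 28 is the
    // Bonnell Atom matched above; the AVX bit is the one the FIXME keeps
    // disabled in codegen.
    static bool detectAtomAndAVX(bool &IsAtom, bool &HasAVX) {
      unsigned EAX, EBX, ECX, EDX;
      if (!__get_cpuid(1, &EAX, &EBX, &ECX, &EDX))
        return false;
      HasAVX = (ECX >> 28) & 1;               // CPUID.1:ECX.AVX
      unsigned Family = (EAX >> 8) & 0xf;     // Base family field.
      unsigned Model = ((EAX >> 4) & 0xf) | ((EAX >> 12) & 0xf0); // + ext model
      IsAtom = (Family == 6 && Model == 28);
      return true;
    }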
@@ -306,6 +317,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
const std::string &FS,
unsigned StackAlignOverride, bool is64Bit)
: X86GenSubtargetInfo(TT, CPU, FS)
+ , X86ProcFamily(Others)
, PICStyle(PICStyles::None)
, X86SSELevel(NoMMXSSE)
, X863DNowLevel(NoThreeDNow)
@@ -313,8 +325,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasX86_64(false)
, HasPOPCNT(false)
, HasSSE4A(false)
- , HasAVX(false)
- , HasAVX2(false)
, HasAES(false)
, HasCLMUL(false)
, HasFMA3(false)
@@ -331,16 +341,19 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, IsUAMemFast(false)
, HasVectorUAMem(false)
, HasCmpxchg16b(false)
- , stackAlignment(8)
+ , UseLeaForSP(false)
+ , PostRAScheduler(false)
+ , stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others?
, MaxInlineSizeThreshold(128)
, TargetTriple(TT)
, In64BitMode(is64Bit) {
// Determine default and user specified characteristics
+ std::string CPUName = CPU;
if (!FS.empty() || !CPU.empty()) {
- std::string CPUName = CPU;
if (CPUName.empty()) {
-#if defined (__x86_64__) || defined(__i386__)
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
CPUName = sys::getHostCPUName();
#else
CPUName = "generic";
@@ -360,6 +373,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
// If feature string is not empty, parse features string.
ParseSubtargetFeatures(CPUName, FullFS);
} else {
+ if (CPUName.empty()) {
+#if defined (__x86_64__) || defined(__i386__)
+ CPUName = sys::getHostCPUName();
+#else
+ CPUName = "generic";
+#endif
+ }
// Otherwise, use CPUID to auto-detect feature set.
AutoDetectSubtargetFeatures();
@@ -368,7 +388,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
HasCMov = true; ToggleFeature(X86::FeatureCMOV);
- if (!HasAVX && X86SSELevel < SSE2) {
+ if (X86SSELevel < SSE2) {
X86SSELevel = SSE2;
ToggleFeature(X86::FeatureSSE1);
ToggleFeature(X86::FeatureSSE2);
@@ -376,14 +396,16 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
}
}
+ if (X86ProcFamily == IntelAtom) {
+ PostRAScheduler = true;
+ InstrItins = getInstrItineraryForCPU(CPUName);
+ }
+
// It's important to keep the MCSubtargetInfo feature bits in sync with
// target data structure which is shared with MC code emitter, etc.
if (In64BitMode)
ToggleFeature(X86::Mode64Bit);
- if (HasAVX)
- X86SSELevel = NoMMXSSE;
-
DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
<< ", 3DNowLevel " << X863DNowLevel
<< ", 64bit " << HasX86_64 << "\n");
@@ -398,3 +420,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
isTargetSolaris() || In64BitMode)
stackAlignment = 16;
}
+
+bool X86Subtarget::enablePostRAScheduler(
+ CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode& Mode,
+ RegClassVector& CriticalPathRCs) const {
+ Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ CriticalPathRCs.clear();
+ return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+}
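The generic post-RA scheduling pass is the consumer of this new hook. A condensed, hypothetical call site (the variable names are illustrative, not the exact in-tree code):

    // The pass asks the subtarget whether to schedule after register
    // allocation at this optimization level, and which anti-dependence
    // breaking mode to apply. For Atom this returns true at
    // CodeGenOpt::Default and above.
    TargetSubtargetInfo::AntiDepBreakMode Mode;
    SmallVector<TargetRegisterClass*, 2> CriticalPathRCs;
    const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
    if (!ST.enablePostRAScheduler(TM.getOptLevel(), Mode, CriticalPathRCs))
      return false;   // Skip post-RA scheduling for this function.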
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index ccb9be0..a36d0d8 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -1,4 +1,4 @@
-//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -42,13 +42,20 @@ enum Style {
class X86Subtarget : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
- NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
};
enum X863DNowEnum {
NoThreeDNow, ThreeDNow, ThreeDNowA
};
+ enum X86ProcFamilyEnum {
+ Others, IntelAtom
+ };
+
+ /// X86ProcFamily - X86 processor family: Intel Atom, and others
+ X86ProcFamilyEnum X86ProcFamily;
+
/// PICStyle - Which PIC style to use
///
PICStyles::Style PICStyle;
@@ -75,12 +82,6 @@ protected:
/// HasSSE4A - True if the processor supports SSE4A instructions.
bool HasSSE4A;
- /// HasAVX - Target has AVX instructions
- bool HasAVX;
-
- /// HasAVX2 - Target has AVX2 instructions
- bool HasAVX2;
-
/// HasAES - Target has AES instructions
bool HasAES;
@@ -131,6 +132,13 @@ protected:
/// this is true for most x86-64 chips, but not the first AMD chips.
bool HasCmpxchg16b;
+ /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+ /// the stack pointer. This is an optimization for Intel Atom processors.
+ bool UseLeaForSP;
+
+ /// PostRAScheduler - True if using post-register-allocation scheduler.
+ bool PostRAScheduler;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -141,6 +149,9 @@ protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
+
+ /// Instruction itineraries for scheduling
+ InstrItineraryData InstrItins;
private:
/// In64BitMode - True if compiling for 64-bit, false for 32-bit.
@@ -185,18 +196,12 @@ public:
bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
bool hasSSE41() const { return X86SSELevel >= SSE41; }
bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasAVX() const { return X86SSELevel >= AVX; }
+ bool hasAVX2() const { return X86SSELevel >= AVX2; }
bool hasSSE4A() const { return HasSSE4A; }
bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
- bool hasAVX() const { return HasAVX; }
- bool hasAVX2() const { return HasAVX2; }
- bool hasXMM() const { return hasSSE1() || hasAVX(); }
- bool hasXMMInt() const { return hasSSE2() || hasAVX(); }
- bool hasSSE3orAVX() const { return hasSSE3() || hasAVX(); }
- bool hasSSSE3orAVX() const { return hasSSSE3() || hasAVX(); }
- bool hasSSE41orAVX() const { return hasSSE41() || hasAVX(); }
- bool hasSSE42orAVX() const { return hasSSE42() || hasAVX(); }
bool hasAES() const { return HasAES; }
bool hasCLMUL() const { return HasCLMUL; }
bool hasFMA3() const { return HasFMA3; }
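Folding AVX and AVX2 into X86SSELevel turns each feature test into an ordering comparison, which is why the hasXMMInt()-style pairings deleted below become redundant: hasSSE2() is now automatically true on an AVX machine. A minimal illustration of the idiom:

    // Ordered-enum idiom: each level implies every lower one, so a single
    // comparison replaces the old hasSSE2() || hasAVX() style predicates.
    enum SSELevel { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 };

    static bool hasAtLeast(SSELevel Have, SSELevel Want) {
      return Have >= Want;   // hasAtLeast(AVX, SSE2) == true.
    }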
@@ -213,6 +218,9 @@ public:
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+ bool useLeaForSP() const { return UseLeaForSP; }
+
+ bool isAtom() const { return X86ProcFamily == IntelAtom; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -226,38 +234,28 @@ public:
// ELF is a reasonably sane default and the only other X86 targets we
// support are Darwin and Windows. Just use "not those".
- bool isTargetELF() const {
- return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing();
- }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
bool isTargetNaCl() const {
return TargetTriple.getOS() == Triple::NativeClient;
}
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
-
bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
- bool isTargetCygMing() const {
- return isTargetMingw() || isTargetCygwin();
- }
-
- /// isTargetCOFF - Return true if this is any COFF/Windows target variant.
- bool isTargetCOFF() const {
- return isTargetMingw() || isTargetCygwin() || isTargetWindows();
- }
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
bool isTargetWin64() const {
// FIXME: x86_64-cygwin has not been released yet.
- return In64BitMode && (isTargetCygMing() || isTargetWindows());
- }
-
- bool isTargetEnvMacho() const {
- return isTargetDarwin() || (TargetTriple.getEnvironment() == Triple::MachO);
+ return In64BitMode && TargetTriple.isOSWindows();
}
bool isTargetWin32() const {
+ // FIXME: Cygwin is included for isTargetWin64 -- should it be included
+ // here too?
return !In64BitMode && (isTargetMingw() || isTargetWindows());
}
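The rewritten predicates defer to llvm::Triple, which derives the object format from the parsed OS instead of the hand-written "not Darwin, not Windows" enumeration. A quick sketch of the behavior these one-liners rely on (Triple and its isOSBinFormat* queries are the real APIs of this period; the demo function is illustrative):

    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    // One triple string drives every predicate.
    static void demo() {
      Triple Linux("x86_64-pc-linux-gnu");
      Triple Win("i686-pc-win32");
      bool IsELF = Linux.isOSBinFormatELF();   // true: Linux implies ELF.
      bool IsCOFF = Win.isOSBinFormatCOFF();   // true: Win32 implies COFF.
      (void)IsELF; (void)IsCOFF;
    }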
@@ -303,6 +301,15 @@ public:
/// indicating the number of scheduling cycles of backscheduling that
/// should be attempted.
unsigned getSpecialAddressLatency() const;
+
+ /// enablePostRAScheduler - Run the post-RA scheduler; enabled for Atom.
+ bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode& Mode,
+ RegClassVector& CriticalPathRCs) const;
+
+ /// getInstrItineraryData - Return the instruction itineraries based on the
+ /// subtarget selection.
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 126042e..f4b7a62 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -28,6 +28,7 @@ extern "C" void LLVMInitializeX86Target() {
RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
}
+void X86_32TargetMachine::anchor() { }
X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
@@ -50,6 +51,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
JITInfo(*this) {
}
+void X86_64TargetMachine::anchor() { }
X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
@@ -76,7 +78,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
FrameLowering(*this, Subtarget),
- ELFWriterInfo(is64Bit, true) {
+ ELFWriterInfo(is64Bit, true),
+ InstrItins(Subtarget.getInstrItineraryData()){
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -99,10 +102,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
// default to hard float ABI
if (Options.FloatABIType == FloatABI::Default)
- this->Options.FloatABIType = FloatABI::Hard;
-
- if (Options.EnableSegmentedStacks && !Subtarget.isTargetELF())
- report_fatal_error("Segmented stacks are only implemented on ELF.");
+ this->Options.FloatABIType = FloatABI::Hard;
}
//===----------------------------------------------------------------------===//
@@ -117,35 +117,61 @@ UseVZeroUpper("x86-use-vzeroupper",
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
-bool X86TargetMachine::addInstSelector(PassManagerBase &PM) {
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+ X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ X86TargetMachine &getX86TargetMachine() const {
+ return getTM<X86TargetMachine>();
+ }
+
+ const X86Subtarget &getX86Subtarget() const {
+ return *getX86TargetMachine().getSubtargetImpl();
+ }
+
+ virtual bool addInstSelector();
+ virtual bool addPreRegAlloc();
+ virtual bool addPostRegAlloc();
+ virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new X86PassConfig(this, PM);
+}
+
+bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
- PM.add(createX86ISelDag(*this, getOptLevel()));
+ PM.add(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
// For 32-bit, prepend instructions to set the "global base reg" for PIC.
- if (!Subtarget.is64Bit())
+ if (!getX86Subtarget().is64Bit())
PM.add(createGlobalBaseRegPass());
return false;
}
-bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM) {
+bool X86PassConfig::addPreRegAlloc() {
PM.add(createX86MaxStackAlignmentHeuristicPass());
return false; // -print-machineinstr shouldn't print after this.
}
-bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM) {
+bool X86PassConfig::addPostRegAlloc() {
PM.add(createX86FloatingPointStackifierPass());
return true; // -print-machineinstr should print after this.
}
-bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM) {
+bool X86PassConfig::addPreEmitPass() {
bool ShouldPrint = false;
- if (getOptLevel() != CodeGenOpt::None && Subtarget.hasXMMInt()) {
+ if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
ShouldPrint = true;
}
- if (Subtarget.hasAVX() && UseVZeroUpper) {
+ if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
PM.add(createX86IssueVZeroUpperPass());
ShouldPrint = true;
}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 3ac1769..143caba 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -27,17 +27,18 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
-
+
class formatted_raw_ostream;
class StringRef;
class X86TargetMachine : public LLVMTargetMachine {
- X86Subtarget Subtarget;
- X86FrameLowering FrameLowering;
- X86ELFWriterInfo ELFWriterInfo;
+ X86Subtarget Subtarget;
+ X86FrameLowering FrameLowering;
+ X86ELFWriterInfo ELFWriterInfo;
+ InstrItineraryData InstrItins;
public:
- X86TargetMachine(const Target &T, StringRef TT,
+ X86TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL,
@@ -56,7 +57,7 @@ public:
virtual const X86TargetLowering *getTargetLowering() const {
llvm_unreachable("getTargetLowering not implemented");
}
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
llvm_unreachable("getSelectionDAGInfo not implemented");
}
virtual const X86RegisterInfo *getRegisterInfo() const {
@@ -65,12 +66,13 @@ public:
virtual const X86ELFWriterInfo *getELFWriterInfo() const {
return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
}
+ virtual const InstrItineraryData *getInstrItineraryData() const {
+ return &InstrItins;
+ }
// Set up the pass pipeline.
- virtual bool addInstSelector(PassManagerBase &PM);
- virtual bool addPreRegAlloc(PassManagerBase &PM);
- virtual bool addPostRegAlloc(PassManagerBase &PM);
- virtual bool addPreEmitPass(PassManagerBase &PM);
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
virtual bool addCodeEmitter(PassManagerBase &PM,
JITCodeEmitter &JCE);
};
@@ -78,6 +80,7 @@ public:
/// X86_32TargetMachine - X86 32-bit target machine.
///
class X86_32TargetMachine : public X86TargetMachine {
+ virtual void anchor();
const TargetData DataLayout; // Calculates type size & alignment
X86InstrInfo InstrInfo;
X86SelectionDAGInfo TSInfo;
@@ -92,7 +95,7 @@ public:
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
}
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
return &TSInfo;
}
virtual const X86InstrInfo *getInstrInfo() const {
@@ -106,6 +109,7 @@ public:
/// X86_64TargetMachine - X86 64-bit target machine.
///
class X86_64TargetMachine : public X86TargetMachine {
+ virtual void anchor();
const TargetData DataLayout; // Calculates type size & alignment
X86InstrInfo InstrInfo;
X86SelectionDAGInfo TSInfo;
@@ -120,7 +124,7 @@ public:
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
}
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
return &TSInfo;
}
virtual const X86InstrInfo *getInstrInfo() const {
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 991f322..c0d2a9c 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===//
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index d7adf27..ceb7a4a 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -1,4 +1,4 @@
-//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===//
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index f8c30eb..2fd78a7 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -145,7 +145,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// to insert any VZEROUPPER instructions. This is constant-time, so it is
// cheap in the common case of no ymm use.
bool YMMUsed = false;
- TargetRegisterClass *RC = X86::VR256RegisterClass;
+ const TargetRegisterClass *RC = X86::VR256RegisterClass;
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
i != e; i++) {
if (MRI.isPhysRegUsed(*i)) {