Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp  140
-rw-r--r--  lib/Target/X86/CMakeLists.txt  2
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp  53
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h  3
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp  34
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h  71
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp  17
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  68
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h  1
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp  90
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h  18
-rw-r--r--  lib/Target/X86/X86.h  5
-rw-r--r--  lib/Target/X86/X86.td  52
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp  10
-rw-r--r--  lib/Target/X86/X86CallingConv.td  13
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp  786
-rw-r--r--  lib/Target/X86/X86FastISel.cpp  99
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp  2
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp  167
-rw-r--r--  lib/Target/X86/X86FrameLowering.h  2
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp  126
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  2145
-rw-r--r--  lib/Target/X86/X86ISelLowering.h  45
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td  4
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h  16
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td  24
-rw-r--r--  lib/Target/X86/X86InstrControl.td  48
-rw-r--r--  lib/Target/X86/X86InstrFMA.td  230
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td  185
-rw-r--r--  lib/Target/X86/X86InstrFormats.td  27
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td  18
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp  912
-rw-r--r--  lib/Target/X86/X86InstrInfo.h  34
-rw-r--r--  lib/Target/X86/X86InstrInfo.td  586
-rw-r--r--  lib/Target/X86/X86InstrMMX.td  403
-rw-r--r--  lib/Target/X86/X86InstrSSE.td  820
-rw-r--r--  lib/Target/X86/X86InstrSystem.td  293
-rw-r--r--  lib/Target/X86/X86InstrVMX.td  8
-rw-r--r--  lib/Target/X86/X86InstrXOP.td  109
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp  48
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h  14
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp  109
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h  14
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td  104
-rw-r--r--  lib/Target/X86/X86Schedule.td  218
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td  229
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp  6
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp  50
-rw-r--r--  lib/Target/X86/X86Subtarget.h  14
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp  23
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp  13
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h  10
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp  4
53 files changed, 5769 insertions, 2753 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 08c732c..95e83ec 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -117,7 +117,7 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
-static bool isImmSExti16i8Value(uint64_t Value) {
+static bool isImmSExti16i8Value(uint64_t Value) {
return (( Value <= 0x000000000000007FULL)||
(0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
(0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
@@ -135,12 +135,12 @@ static bool isImmZExtu32u8Value(uint64_t Value) {
static bool isImmSExti64i8Value(uint64_t Value) {
return (( Value <= 0x000000000000007FULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
}
static bool isImmSExti64i32Value(uint64_t Value) {
return (( Value <= 0x000000007FFFFFFFULL)||
- (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
}
namespace {
@@ -187,7 +187,7 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
virtual void print(raw_ostream &OS) const {}
@@ -309,28 +309,45 @@ struct X86Operand : public MCParsedAsmOperand {
}
bool isMem() const { return Kind == Memory; }
- bool isMem8() const {
+ bool isMem8() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 8);
}
- bool isMem16() const {
+ bool isMem16() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 16);
}
- bool isMem32() const {
+ bool isMem32() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 32);
}
- bool isMem64() const {
+ bool isMem64() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 64);
}
- bool isMem80() const {
+ bool isMem80() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 80);
}
- bool isMem128() const {
+ bool isMem128() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 128);
}
- bool isMem256() const {
+ bool isMem256() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 256);
}
+ bool isMemVX32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVX64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
!getMemIndexReg() && getMemScale() == 1;
@@ -356,26 +373,38 @@ struct X86Operand : public MCParsedAsmOperand {
addExpr(Inst, getImm());
}
- void addMem8Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem8Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem16Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem16Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem80Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem128Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem80Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMem256Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem128Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMemVX32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
- void addMem256Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addMemVY32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVX64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVY64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
}
void addMemOperands(MCInst &Inst, unsigned N) const {
@@ -467,7 +496,7 @@ bool X86AsmParser::isSrcOp(X86Operand &Op) {
bool X86AsmParser::isDstOp(X86Operand &Op) {
unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
- return Op.isMem() &&
+ return Op.isMem() &&
(Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
@@ -611,7 +640,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (getLexer().isNot(AsmToken::LBrac))
return ErrorOperand(Start, "Expected '[' token!");
Parser.Lex();
-
+
if (getLexer().is(AsmToken::Identifier)) {
// Parse BaseReg
if (ParseRegister(BaseReg, Start, End)) {
@@ -638,11 +667,11 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
// Handle '[' Scale*IndexReg ']'
Parser.Lex();
SMLoc IdxRegLoc = Parser.getTok().getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
} else
- return ErrorOperand(Loc, "Unepxeted token");
+ return ErrorOperand(Loc, "Unexpected token");
}
if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
@@ -655,8 +684,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (getLexer().is(AsmToken::Star)) {
Parser.Lex();
SMLoc IdxRegLoc = Parser.getTok().getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
+ if (ParseRegister(IndexReg, IdxRegLoc, End))
+ return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
} else if (getLexer().is(AsmToken::RBrac)) {
const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext());
@@ -668,7 +697,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
End = Parser.getTok().getLoc();
if (!IndexReg)
ParseRegister(IndexReg, Start, End);
- else if (getParser().ParseExpression(Disp, End)) return 0;
+ else if (getParser().ParseExpression(Disp, End)) return 0;
}
}
@@ -881,7 +910,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
if (getParser().ParseAbsoluteExpression(ScaleVal)){
Error(Loc, "expected scale expression");
return 0;
- }
+ }
// Validate the scale amount.
if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
@@ -916,15 +945,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be a 128-bit or 256-bit register.
if (BaseReg != 0 && IndexReg != 0) {
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
- !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
IndexReg != X86::RIZ) {
Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
return 0;
}
if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
- !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
IndexReg != X86::EIZ){
Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
return 0;
@@ -944,7 +976,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
PatchedName != "setb" && PatchedName != "setnb")
PatchedName = PatchedName.substr(0, Name.size()-1);
-
+
// FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
const MCExpr *ExtraImmOp = 0;
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
@@ -1204,20 +1236,20 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
// Intel syntax
X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[2];
- Operands.pop_back();
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[2];
+ Operands.pop_back();
}
} else {
X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[1];
- Operands.erase(Operands.begin() + 1);
+ cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
}
}
}
-
+
// Transforms "int $3" into "int3" as a size optimization. We can't write an
// instalias with an immediate operand yet.
if (Name == "int" && Operands.size() == 2) {
@@ -1520,7 +1552,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
case Match_Success:
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
- // individual transformations can chain off each other.
+ // individual transformations can chain off each other.
while (processInstruction(Inst, Operands))
;
@@ -1558,12 +1590,12 @@ MatchAndEmitInstruction(SMLoc IDLoc,
// Otherwise, we assume that this may be an integer instruction, which comes
// in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
-
+
// Check for the various suffix matches.
Tmp[Base.size()] = Suffixes[0];
unsigned ErrorInfoIgnore;
unsigned Match1, Match2, Match3, Match4;
-
+
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
Tmp[Base.size()] = Suffixes[1];
Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
@@ -1673,10 +1705,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
getParser().setAssemblerDialect(1);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if(Parser.getTok().getString() == "noprefix") {
- // FIXME : Handle noprefix
- Parser.Lex();
+ // FIXME : Handle noprefix
+ Parser.Lex();
} else
- return true;
+ return true;
}
return false;
}
@@ -1691,19 +1723,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
const MCExpr *Value;
if (getParser().ParseExpression(Value))
return true;
-
+
getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/);
-
+
if (getLexer().is(AsmToken::EndOfStatement))
break;
-
+
// FIXME: Improve diagnostic.
if (getLexer().isNot(AsmToken::Comma))
return Error(L, "unexpected token in directive");
Parser.Lex();
}
}
-
+
Parser.Lex();
return false;
}
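
The immediate predicates at the top of this file encode "fits when sign-extended" as a union of two unsigned intervals. A minimal standalone sketch of the same check, mirroring isImmSExti64i32Value above (illustration only, not the LLVM code itself):

    #include <cassert>
    #include <cstdint>

    // A 64-bit immediate is representable as a sign-extended 32-bit value
    // iff it lies in [0, INT32_MAX] or in the top 2^31 unsigned values,
    // which are the wrapped-around negative int32 range.
    static bool isImmSExti64i32Value(uint64_t Value) {
      return (Value <= 0x000000007FFFFFFFULL) ||
             (Value >= 0xFFFFFFFF80000000ULL);
    }

    int main() {
      assert(isImmSExti64i32Value(0x7FFFFFFFULL));   // INT32_MAX fits
      assert(!isImmSExti64i32Value(0x80000000ULL));  // 2^31 does not
      assert(isImmSExti64i32Value(~0ULL));           // -1 fits
      return 0;
    }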
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index f612e23..b886d46 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -52,6 +52,8 @@ endif()
add_llvm_target(X86CodeGen ${sources})
+add_dependencies(LLVMX86CodeGen intrinsics_gen)
+
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index b13a006..4bbfe95 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -356,15 +356,15 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
// Special case those X86 instructions that use the imm8 as a set of
// bits, bit count, etc. and are not sign-extend.
if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
- Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
- Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
- Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
- Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
- Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
- Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
- Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
- Opcode != X86::VINSERTPSrr)
- type = TYPE_MOFFS8;
+ Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+ Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+ Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+ Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+ Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+ Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+ Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+ Opcode != X86::VINSERTPSrr)
+ type = TYPE_MOFFS8;
break;
case ENCODING_IW:
type = TYPE_MOFFS16;
@@ -498,7 +498,38 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
} else {
baseReg = MCOperand::CreateReg(0);
}
-
+
+ // Check whether we are handling VSIB addressing mode for GATHER.
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+ // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+ // I don't see a way to get the correct IndexReg in readSIB:
+ // We can tell whether it is VSIB or SIB after instruction ID is decoded,
+ // but instruction ID may not be decoded yet when calling readSIB.
+ uint32_t Opcode = mcInst.getOpcode();
+ bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+ Opcode == X86::VGATHERDPDYrm ||
+ Opcode == X86::VGATHERQPDrm ||
+ Opcode == X86::VGATHERDPSrm ||
+ Opcode == X86::VGATHERQPSrm ||
+ Opcode == X86::VPGATHERDQrm ||
+ Opcode == X86::VPGATHERDQYrm ||
+ Opcode == X86::VPGATHERQQrm ||
+ Opcode == X86::VPGATHERDDrm ||
+ Opcode == X86::VPGATHERQDrm);
+ bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+ Opcode == X86::VGATHERDPSYrm ||
+ Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VPGATHERQQYrm ||
+ Opcode == X86::VPGATHERDDYrm ||
+ Opcode == X86::VPGATHERQDYrm);
+ if (IndexIs128 || IndexIs256) {
+ unsigned IndexOffset = insn.sibIndex -
+ (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+ SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ insn.sibIndex = (SIBIndex)(IndexBase +
+ (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+ }
+
if (insn.sibIndex != SIB_INDEX_NONE) {
switch (insn.sibIndex) {
default:
@@ -509,6 +540,8 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
indexReg = MCOperand::CreateReg(X86::x); break;
EA_BASES_32BIT
EA_BASES_64BIT
+ REGS_XMM
+ REGS_YMM
#undef ENTRY
}
} else {
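
The VSIB fix-up above has to rebase the index after the fact because readSIB runs before the opcode is known. A compressed sketch of that rebasing arithmetic, with made-up enum values standing in for the decoder's tables:

    #include <cassert>

    // Hypothetical stand-ins for the decoder's SIBIndex values.
    enum SIBIndex { SIB_INDEX_NONE = 0, SIB_INDEX_EAX = 1,
                    SIB_INDEX_XMM0 = 100, SIB_INDEX_YMM0 = 200 };

    // Under VSIB the SIB index slot always names a vector register: slot 4,
    // which ordinarily means "no index", is really XMM4/YMM4 for GATHER.
    static SIBIndex rebaseForVSIB(SIBIndex Idx, bool Is256) {
      unsigned Offset =
          (Idx == SIB_INDEX_NONE) ? 4 : (unsigned)Idx - SIB_INDEX_EAX;
      unsigned Base = Is256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
      return (SIBIndex)(Base + Offset);
    }

    int main() {
      // The scalar decoder said "no index"; for a 128-bit gather that slot
      // actually encodes XMM4.
      assert(rebaseForVSIB(SIB_INDEX_NONE, false) == SIB_INDEX_XMM0 + 4);
      return 0;
    }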
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index fae309b..e2caf6a 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -310,11 +310,14 @@ typedef enum {
* SIBIndex - All possible values of the SIB index field.
* Borrows entries from ALL_EA_BASES with the special case that
* sib is synonymous with NONE.
+ * Vector SIB: index can be XMM or YMM.
*/
typedef enum {
SIB_INDEX_NONE,
#define ENTRY(x) SIB_INDEX_##x,
ALL_EA_BASES
+ REGS_XMM
+ REGS_YMM
#undef ENTRY
SIB_INDEX_max
} SIBIndex;
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index f532019..64ac5e6 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -96,7 +96,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFHWmi:
case X86::VPSHUFHWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFHWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFHWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFHWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFHWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
break;
case X86::PSHUFLWri:
@@ -106,7 +116,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFLWmi:
case X86::VPSHUFLWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ DecodePSHUFLWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFLWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFLWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePSHUFLWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
break;
@@ -487,6 +507,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::VPERMQYri:
+ case X86::VPERMPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMQYmi:
+ case X86::VPERMPDYmi:
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
}
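
The MVT parameter threaded through these decoders lets one loop serve both the 128-bit forms and the per-lane AVX2 forms. A self-contained sketch of the PSHUFHW decode for v16i16, with the element count passed directly instead of via MVT:

    #include <cstdio>
    #include <vector>

    // Within each 128-bit lane of 8 words, the low 4 elements pass through
    // and the high 4 are chosen by successive 2-bit immediate fields; AVX2
    // simply repeats the pattern in the second lane.
    static void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
                                  std::vector<int> &Mask) {
      for (unsigned l = 0; l != NumElts; l += 8) {
        unsigned NewImm = Imm;
        for (unsigned i = 0; i != 4; ++i)
          Mask.push_back(l + i);                  // low half unchanged
        for (unsigned i = 4; i != 8; ++i) {
          Mask.push_back(l + 4 + (NewImm & 3));   // high half from imm
          NewImm >>= 2;
        }
      }
    }

    int main() {
      std::vector<int> Mask;
      DecodePSHUFHWMask(16, 0x1B, Mask);  // imm 0b00011011 reverses the high half
      for (int M : Mask) std::printf("%d ", M);
      // prints: 0 1 2 3 7 6 5 4 8 9 10 11 15 14 13 12
      return 0;
    }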
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index a0bb6dc..db597fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -94,40 +94,83 @@ namespace X86II {
MO_PLT,
/// MO_TLSGD - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the offset of the GOT entry with the TLS index structure that contains
+ /// the module number and variable offset for the symbol. Used in the
+ /// general dynamic TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TLSGD
MO_TLSGD,
+ /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// __tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLD
+ MO_TLSLD,
+
+ /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS index for the module that
+ /// contains the symbol. When this index is passed to a call to
+ /// ___tls_get_addr, the function will return the base address of the TLS
+ /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSLDM
+ MO_TLSLDM,
+
/// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the offset of the GOT entry with the thread-pointer offset for the
+ /// symbol. Used in the x86-64 initial exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @GOTTPOFF
MO_GOTTPOFF,
/// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the absolute address of the GOT entry with the negative thread-pointer
+ /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+ /// model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @INDNTPOFF
MO_INDNTPOFF,
/// MO_TPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the thread-pointer offset for the symbol. Used in the x86-64 local
+ /// exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @TPOFF
MO_TPOFF,
+ /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the TLS offset of the symbol. Used
+ /// in the local dynamic TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @DTPOFF
+ MO_DTPOFF,
+
/// MO_NTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
+ /// the negative thread-pointer offset for the symbol. Used in the IA32
+ /// local exec TLS access model.
///
/// See 'ELF Handling for Thread-Local Storage' for more details.
/// SYMBOL_LABEL @NTPOFF
MO_NTPOFF,
+ /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+ /// the offset of the GOT entry with the negative thread-pointer offset for
+ /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTNTPOFF
+ MO_GOTNTPOFF,
+
/// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
/// reference is actually to the "__imp_FOO" symbol. This is used for
/// dllimport linkage on windows.
@@ -438,17 +481,17 @@ namespace X86II {
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
// specified machine instruction.
//
- static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
return TSFlags >> X86II::OpcodeShift;
}
- static inline bool hasImm(uint64_t TSFlags) {
+ inline bool hasImm(uint64_t TSFlags) {
return (TSFlags & X86II::ImmMask) != 0;
}
/// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
/// of the specified instruction.
- static inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ inline unsigned getSizeOfImm(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8:
@@ -463,7 +506,7 @@ namespace X86II {
/// isImmPCRel - Return true if the immediate of the specified instruction's
/// TSFlags indicates that it is pc relative.
- static inline unsigned isImmPCRel(uint64_t TSFlags) {
+ inline unsigned isImmPCRel(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8PCRel:
@@ -486,9 +529,11 @@ namespace X86II {
/// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
/// counted as one operand.
///
- static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form");
+ case X86II::MRMInitReg:
+ // FIXME: Remove this form.
+ return -1;
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
case X86II::RawFrm:
@@ -546,7 +591,7 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
- static inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ inline bool isX86_64ExtendedReg(unsigned RegNo) {
switch (RegNo) {
default: break;
case X86::R8: case X86::R9: case X86::R10: case X86::R11:
@@ -568,7 +613,7 @@ namespace X86II {
return false;
}
- static inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ inline bool isX86_64NonExtLowByteReg(unsigned reg) {
return (reg == X86::SPL || reg == X86::BPL ||
reg == X86::SIL || reg == X86::DIL);
}
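
For context on the new local-dynamic operands (MO_TLSLD/MO_TLSLDM plus MO_DTPOFF): the model pays for one __tls_get_addr call to learn the module's TLS base, then addresses every local-dynamic variable by a constant DTPOFF offset from it. A small C++ illustration of code that benefits; how any given compiler actually lowers this depends on target and flags:

    // Two internal-linkage TLS variables: under -fPIC a compiler may choose
    // the local-dynamic model, emitting one @TLSLD-relocated call to
    // __tls_get_addr and then addressing x and y via @DTPOFF offsets.
    static thread_local int x;
    static thread_local int y;

    int sum() { return x + y; }  // a single TLS-base fetch can serve both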
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index afa545c..49c07f3 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -35,19 +35,6 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
clEnumValEnd));
-static const char *const x86_asm_table[] = {
- "{si}", "S",
- "{di}", "D",
- "{ax}", "a",
- "{cx}", "c",
- "{memory}", "memory",
- "{flags}", "",
- "{dirflag}", "",
- "{fpsr}", "",
- "{fpcr}", "",
- "{cc}", "cc",
- 0,0};
-
void X86MCAsmInfoDarwin::anchor() { }
X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
@@ -55,7 +42,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
if (is64Bit)
PointerSize = 8;
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -88,7 +74,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
if (T.getArch() == Triple::x86_64)
PointerSize = 8;
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -137,7 +122,6 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
PrivateGlobalPrefix = ".L";
}
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
@@ -151,7 +135,6 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
PrivateGlobalPrefix = ".L";
}
- AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 80990e5..4a38324 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -139,6 +139,7 @@ public:
MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new X86MCCodeEmitter(MCII, STI, Ctx);
@@ -569,7 +570,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
case X86II::MRMDestMem: {
@@ -602,11 +613,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
VEX_R = 0x0;
if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 1);
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -616,7 +627,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_X = 0x0;
if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ // CurOp points to start of the MemoryOperand,
+ // it skips TIED_TO operands if they exist, then increments past src1.
+ // CurOp + X86::AddrNumOperands will point to src3.
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
break;
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
@@ -961,11 +977,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// FIXME: This should be handled during MCInst lowering.
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1)
+ if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0)
- // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
- --NumOps;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
// Keep track of the current byte being emitted.
unsigned CurByte = 0;
@@ -1037,7 +1056,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
- SrcRegNum++;
+ ++SrcRegNum;
EmitMemModRMByte(MI, CurOp,
GetX86RegNum(MI.getOperand(SrcRegNum)),
@@ -1050,15 +1069,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SrcRegNum = CurOp + 1;
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
- SrcRegNum++;
+ ++SrcRegNum;
- if(HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
- SrcRegNum++;
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
EmitRegModRMByte(MI.getOperand(SrcRegNum),
GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
- // 2 operands skipped with HasMemOp4, comensate accordingly
+ // 2 operands skipped with HasMemOp4, compensate accordingly
CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
if (HasVEX_4VOp3)
++CurOp;
@@ -1071,7 +1090,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
++AddrOperands;
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
}
- if(HasMemOp4) // Skip second register source (encoded in I8IMM)
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
++FirstMemOp;
EmitByte(BaseOpcode, CurByte, OS);
@@ -1089,7 +1108,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++;
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
EmitRegModRMByte(MI.getOperand(CurOp++),
(TSFlags & X86II::FormMask)-X86II::MRM0r,
@@ -1100,7 +1119,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++;
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
TSFlags, CurByte, OS, Fixups);
@@ -1149,22 +1168,23 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
}
// If there is a remaining operand, it must be a trailing immediate. Emit it
- // according to the right size for the instruction.
- if (CurOp != NumOps) {
+ // according to the right size for the instruction. Some instructions
+ // (SSE4a extrq and insertq) have two trailing immediates.
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
// The last source register of a 4 operand instruction in AVX is encoded
// in bits[7:4] of a immediate byte.
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
- : CurOp);
- CurOp++;
- bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg());
- unsigned RegNum = (IsExtReg ? (1 << 7) : 0);
- RegNum |= GetX86RegNum(MO) << 4;
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = GetX86RegNum(MO) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
// If there is an additional 5th operand it must be an immediate, which
// is encoded in bits[3:0]
- if(CurOp != NumOps) {
+ if (CurOp != NumOps) {
const MCOperand &MIMM = MI.getOperand(CurOp++);
- if(MIMM.isImm()) {
+ if (MIMM.isImm()) {
unsigned Val = MIMM.getImm();
assert(Val < 16 && "Immediate operand value out of range");
RegNum |= Val;
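
The GATHER special case above exists because those instructions carry two tied results, the destination and the written-back mask, and the encoder must step over both before reaching real sources. A toy model of that skip logic, with a plain array standing in for MCInstrDesc's operand constraints:

    #include <cassert>

    // TiedTo[i] is the operand that operand i is tied to, or -1 if untied
    // (a simplified stand-in for getOperandConstraint(i, MCOI::TIED_TO)).
    static unsigned firstSourceOperand(unsigned NumOps, const int *TiedTo) {
      unsigned CurOp = 0;
      if (NumOps > 1 && TiedTo[1] == 0)
        ++CurOp;                  // usual two-address case: skip the dst
      else if (NumOps > 3 && TiedTo[2] == 0) {
        assert(TiedTo[NumOps - 1] == 1 && "mask_wb must tie to the mask");
        CurOp += 2;               // GATHER: skip dst and mask_wb
      }
      return CurOp;
    }

    int main() {
      int Add[] = {-1, 0, -1};                        // two-address add
      assert(firstSourceOperand(3, Add) == 1);
      int Gather[] = {-1, -1, 0, -1, -1, -1, -1, 1};  // two tied results
      assert(firstSourceOperand(8, Gather) == 2);
      return 0;
    }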
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 9896cbe..4650069 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -76,6 +76,7 @@ namespace X86_MC {
}
MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index a802333..8b87c1f 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -64,13 +64,13 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumLaneElts = NumElts / NumLanes;
- int NewImm = Imm;
+ unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
ShuffleMask.push_back(NewImm % NumLaneElts + l);
@@ -80,48 +80,55 @@ void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- ShuffleMask.push_back(0);
- ShuffleMask.push_back(1);
- ShuffleMask.push_back(2);
- ShuffleMask.push_back(3);
- for (unsigned i = 0; i != 4; ++i) {
- ShuffleMask.push_back(4+(Imm & 3));
- Imm >>= 2;
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
}
}
-void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
- for (unsigned i = 0; i != 4; ++i) {
- ShuffleMask.push_back((Imm & 3));
- Imm >>= 2;
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
}
- ShuffleMask.push_back(4);
- ShuffleMask.push_back(5);
- ShuffleMask.push_back(6);
- ShuffleMask.push_back(7);
}
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumLaneElts = NumElts / NumLanes;
- int NewImm = Imm;
+ unsigned NewImm = Imm;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- // Part that reads from dest.
- for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + l);
- NewImm /= NumLaneElts;
- }
- // Part that reads from src.
- for (unsigned i = 0; i != NumLaneElts/2; ++i) {
- ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l);
- NewImm /= NumLaneElts;
+ // each half of a lane comes from a different source
+ for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
}
if (NumLaneElts == 4) NewImm = Imm; // reload imm
}
@@ -130,7 +137,7 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -150,7 +157,7 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -167,19 +174,26 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
if (Imm & 0x88)
return; // Not a shuffle
unsigned HalfSize = VT.getVectorNumElements()/2;
- unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;
- unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
- for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i)
- ShuffleMask.push_back(i);
- for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i)
- ShuffleMask.push_back(i);
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+ }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != 4; ++i) {
+ ShuffleMask.push_back((Imm >> (2*i)) & 3);
+ }
}
} // llvm namespace
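
DecodeVPERMMask is small enough to exercise by hand: each 2-bit field of the immediate selects the source qword for one destination lane. A sketch with std::vector in place of SmallVectorImpl:

    #include <cstdio>
    #include <vector>

    // Mirrors the decoder above: lane i of the result comes from source
    // element (Imm >> 2*i) & 3.
    static void DecodeVPERMMask(unsigned Imm, std::vector<int> &Mask) {
      for (unsigned i = 0; i != 4; ++i)
        Mask.push_back((Imm >> (2 * i)) & 3);
    }

    int main() {
      std::vector<int> Mask;
      DecodeVPERMMask(0x1B, Mask);               // 0b00011011 -> full reversal
      for (int M : Mask) std::printf("%d ", M);  // prints: 3 2 1 0
      return 0;
    }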
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 5b8c6ef..70d8171 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -35,31 +35,35 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
// <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
-void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
/// and punpckh*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
/// and punpckl*. VT indicates the type of the vector allowing it to handle
/// different datatypes and vector widths.
-void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask);
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-void DecodeVPERM2X128Mask(EVT VT, unsigned Imm,
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
} // llvm namespace
#endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index ecc7b59..bf05ccf 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -36,6 +36,11 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM,
/// register for PIC on x86-32.
FunctionPass* createGlobalBaseRegPass();
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
/// createX86FloatingPointStackifierPass - This function returns a pass which
/// converts floating point register references and pseudo instructions into
/// floating point stack references and physical instructions.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index b6591d4..6c1a816 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -86,21 +86,24 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
-def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true",
- "Enable carry-less multiplication instructions">;
-def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true",
+def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+ "Enable packed carry-less multiplication instructions",
+ [FeatureSSE2]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
"Enable three-operand fused multiple-add",
[FeatureAVX]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
- [FeatureAVX]>;
+ [FeatureAVX, FeatureSSE4A]>;
def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
- "Enable XOP instructions">;
+ "Enable XOP instructions",
+ [FeatureAVX, FeatureSSE4A]>;
def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
"HasVectorUAMem", "true",
"Allow unaligned memory operands on vector/SIMD instructions">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
- "Enable AES instructions">;
+ "Enable AES instructions",
+ [FeatureSSE2]>;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
@@ -128,10 +131,10 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, GenericItineraries, Features>;
+ : ProcessorModel<Name, GenericModel, Features>;
class AtomProc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, AtomItineraries, Features>;
+ : ProcessorModel<Name, AtomModel, Features>;
def : Proc<"generic", []>;
def : Proc<"i386", []>;
@@ -169,25 +172,23 @@ def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeatureCLMUL]>;
+ FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
// Sandy Bridge
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
-// FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL]>;
+def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL]>;
// Ivy Bridge
-def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL,
+def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL,
FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>;
// Haswell
-// FIXME: Disabling AVX/AVX2/FMA3 for now since it's not ready.
-def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT,
- FeatureAES, FeatureCLMUL, FeatureRDRAND,
+def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL, FeatureRDRAND,
FeatureF16C, FeatureFSGSBase,
FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2]>;
+ FeatureBMI2, FeatureFMA]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -211,21 +212,20 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
+def : Proc<"amdfam10", [FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
FeaturePOPCNT, FeatureSlowBTMem]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
FeatureLZCNT, FeaturePOPCNT]>;
-// FIXME: Disabling AVX/FMA4 for now since it's not ready.
// Bulldozer
-def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL,
- FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>;
+def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureLZCNT, FeaturePOPCNT]>;
// Enhanced Bulldozer
-def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeatureCLMUL,
- FeatureXOP, FeatureF16C, FeatureLZCNT,
+def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL,
+ FeatureF16C, FeatureLZCNT,
FeaturePOPCNT, FeatureBMI]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 7db7ccb..db71e27 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -20,10 +20,10 @@
#include "X86TargetMachine.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "llvm/CallingConv.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
-#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -186,10 +186,14 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
O << '-' << *MF->getPICBaseSymbol();
break;
case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_TLSLD: O << "@TLSLD"; break;
+ case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
case X86II::MO_GOT: O << "@GOT"; break;
case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
@@ -403,7 +407,9 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO = MI->getOperand(OpNo);
switch (ExtraCode[0]) {
- default: return true; // Unknown modifier.
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'a': // This is an address. Currently only 'i' and 'r' are expected.
if (MO.isImm()) {
O << MO.getImm();
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index d148989..a6d2709 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -29,10 +29,13 @@ def RetCC_X86Common : CallingConv<[
// up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
// for functions that return two i8 values are currently expected to pack the
// values into an i16 (which uses AX, and thus AL:AH).
- CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
- CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
- CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
- CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+ //
+ // For code that doesn't care about the ABI, we allow returning more than two
+ // integer values in registers.
+ CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
// Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
// can only be used by ABI non-compliant code. If the target doesn't have XMM
@@ -413,7 +416,7 @@ def CC_X86 : CallingConv<[
// Callee-saved Registers.
//===----------------------------------------------------------------------===//
-def CSR_Ghc : CalleeSavedRegs<(add)>;
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index ee3de9a..d705049 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -53,12 +53,12 @@ namespace {
public:
static char ID;
explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
- : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+ : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(false),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
Emitter(X86TargetMachine &tm, CodeEmitter &mce,
const X86InstrInfo &ii, const TargetData &td, bool is64)
- : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
+ : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
@@ -68,8 +68,20 @@ namespace {
return "X86 Machine Code Emitter";
}
+ void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const;
+
+ void emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const;
+
void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
-
+
void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<MachineModuleInfo>();
@@ -115,17 +127,17 @@ template<class CodeEmitter>
bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
MMI = &getAnalysis<MachineModuleInfo>();
MCE.setModuleInfo(MMI);
-
+
II = TM.getInstrInfo();
TD = TM.getTargetData();
Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
-
+
do {
- DEBUG(dbgs() << "JITTing function '"
+ DEBUG(dbgs() << "JITTing function '"
<< MF.getFunction()->getName() << "'\n");
MCE.startFunction(MF);
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
MCE.StartMachineBasicBlock(MBB);
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
@@ -149,18 +161,18 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
static unsigned determineREX(const MachineInstr &MI) {
unsigned REX = 0;
const MCInstrDesc &Desc = MI.getDesc();
-
+
// Pseudo instructions do not need REX prefix byte.
if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
return 0;
if (Desc.TSFlags & X86II::REX_W)
REX |= 1 << 3;
-
+
unsigned NumOps = Desc.getNumOperands();
if (NumOps) {
bool isTwoAddr = NumOps > 1 &&
- Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
+ Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
unsigned i = isTwoAddr ? 1 : 0;
for (unsigned e = NumOps; i != e; ++i) {
@@ -171,7 +183,7 @@ static unsigned determineREX(const MachineInstr &MI) {
REX |= 0x40;
}
}
-
+
switch (Desc.TSFlags & X86II::FormMask) {
case X86II::MRMInitReg:
if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
@@ -362,7 +374,7 @@ void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
}
template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
unsigned Index,
unsigned Base) {
// SIB byte is in the same format as the ModRMByte...
@@ -378,8 +390,8 @@ void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
}
}
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
static bool isDisp8(int Value) {
return Value == (signed char)Value;
}
@@ -388,10 +400,10 @@ static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
const TargetMachine &TM) {
// For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
// mechanism as 32-bit mode.
- if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
!TM.getSubtarget<X86Subtarget>().isTargetDarwin())
return false;
-
+
// Return true if this is a reference to a stub containing the address of the
// global, not the global itself.
return isGlobalStubReference(GVOp.getTargetFlags());
@@ -417,7 +429,7 @@ void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
if (RelocOp->isGlobal()) {
// In 64-bit static small code model, we could potentially emit absolute.
// But it's probably not beneficial. If the MCE supports using RIP directly
- // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+ // do it, otherwise fallback to absolute (this is determined by IsPCRel).
// 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
// 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
@@ -441,7 +453,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
const MachineOperand &Op3 = MI.getOperand(Op+3);
int DispVal = 0;
const MachineOperand *DispForReloc = 0;
-
+
// Figure out what sort of displacement we have to handle here.
if (Op3.isGlobal()) {
DispForReloc = &Op3;
@@ -469,7 +481,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
const MachineOperand &IndexReg = MI.getOperand(Op+2);
unsigned BaseReg = Base.getReg();
-
+
// Handle %rip relative addressing.
if (BaseReg == X86::RIP ||
(Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
@@ -486,7 +498,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
// Is a SIB byte needed?
- // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
// resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
// 2-7) and absolute references.
unsigned BaseRegNo = -1U;
@@ -494,7 +506,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
BaseRegNo = X86_MC::getX86RegNum(BaseReg);
if (// The SIB byte must be used if there is an index register.
- IndexReg.getReg() == 0 &&
+ IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
// present.
@@ -508,7 +520,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
return;
}
-
+
// If the base is not EBP/ESP and there is no displacement, use simple
// indirect register encoding, this handles addresses like [EAX]. The
// encoding for [EBP] with no displacement means [disp32] so we handle it
@@ -517,20 +529,20 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
return;
}
-
+
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (!DispForReloc && isDisp8(DispVal)) {
MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
emitConstant(DispVal, 1);
return;
}
-
+
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
return;
}
-
+
// Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
assert(IndexReg.getReg() != X86::ESP &&
IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
@@ -563,7 +575,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
- // Handle the SIB byte for the case where there is no base, see Intel
+ // Handle the SIB byte for the case where there is no base, see Intel
// Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
@@ -596,94 +608,116 @@ static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
return Desc;
}
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
- const MCInstrDesc *Desc) {
- DEBUG(dbgs() << MI);
-
- // If this is a pseudo instruction, lower it.
- switch (Desc->getOpcode()) {
- case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break;
- case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break;
- case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break;
- case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break;
- case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break;
- case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break;
- case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break;
- case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break;
- case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break;
- case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break;
- case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
- case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
- case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
- case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break;
- case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
- case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
- case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
- }
-
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
- MCE.processDebugLoc(MI.getDebugLoc(), true);
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
- unsigned Opcode = Desc->Opcode;
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+ const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+#endif
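
Taken together, these predicates drive the address-size override decision in emitOpcodePrefix below: 64-bit code needs the 0x67 prefix for a 32-bit address, and 16/32-bit code needs it for a 16-bit address. A hedged standalone sketch of just that decision, with booleans standing in for the register-class checks above:

    // Whether the 0x67 address-size override prefix must be emitted.
    static bool needsAddressOverride(bool AdSizeFlag, bool HasMemOperand,
                                     bool Is64BitMode, bool MemIs32Bit,
                                     bool MemIs16Bit) {
      if (AdSizeFlag)     return true;  // the instruction asks for it outright
      if (!HasMemOperand) return false; // no address to override
      return Is64BitMode ? MemIs32Bit   // 64-bit mode: 32-bit address
                         : MemIs16Bit;  // 16/32-bit mode: 16-bit address
    }
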
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
// Emit the lock opcode prefix as needed.
if (Desc->TSFlags & X86II::LOCK)
MCE.emitByte(0xF0);
// Emit segment override opcode prefix as needed.
- switch (Desc->TSFlags & X86II::SegOvrMask) {
- case X86II::FS:
- MCE.emitByte(0x64);
- break;
- case X86II::GS:
- MCE.emitByte(0x65);
- break;
- default: llvm_unreachable("Invalid segment!");
- case 0: break; // No segment override!
- }
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
// Emit the repeat opcode prefix as needed.
if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
MCE.emitByte(0xF3);
- // Emit the operand size opcode prefix as needed.
- if (Desc->TSFlags & X86II::OpSize)
- MCE.emitByte(0x66);
-
// Emit the address size opcode prefix as needed.
- if (Desc->TSFlags & X86II::AdSize)
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemOperand == -1) {
+ need_address_override = false;
+ } else if (Is64BitMode) {
+ assert(!Is16BitMemOperand(MI, MemOperand));
+ need_address_override = Is32BitMemOperand(MI, MemOperand);
+ } else {
+ assert(!Is64BitMemOperand(MI, MemOperand));
+ need_address_override = Is16BitMemOperand(MI, MemOperand);
+ }
+
+ if (need_address_override)
MCE.emitByte(0x67);
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
+
bool Need0FPrefix = false;
switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::TB: // Two-byte opcode prefix
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
- case X86II::A6: // 0F A6
- case X86II::A7: // 0F A7
- Need0FPrefix = true;
- break;
- case X86II::REP: break; // already handled.
- case X86II::T8XS: // F3 0F 38
- case X86II::XS: // F3 0F
- MCE.emitByte(0xF3);
- Need0FPrefix = true;
- break;
- case X86II::T8XD: // F2 0F 38
- case X86II::TAXD: // F2 0F 3A
- case X86II::XD: // F2 0F
- MCE.emitByte(0xF2);
- Need0FPrefix = true;
- break;
- case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
- case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
- MCE.emitByte(0xD8+
- (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
- >> X86II::Op0Shift));
- break; // Two-byte opcode prefix
- default: llvm_unreachable("Invalid prefix!");
- case 0: break; // No prefix!
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::A6: // 0F A6
+ case X86II::A7: // 0F A7
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::T8XS: // F3 0F 38
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::TAXD: // F2 0F 3A
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
}
// Handle REX prefix.
@@ -697,50 +731,446 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
MCE.emitByte(0x0F);
switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::T8XD: // F2 0F 38
- case X86II::T8XS: // F3 0F 38
- case X86II::T8: // 0F 38
- MCE.emitByte(0x38);
- break;
- case X86II::TAXD: // F2 0F 38
- case X86II::TA: // 0F 3A
- MCE.emitByte(0x3A);
- break;
- case X86II::A6: // 0F A6
- MCE.emitByte(0xA6);
- break;
- case X86II::A7: // 0F A7
- MCE.emitByte(0xA7);
- break;
+ case X86II::T8XD: // F2 0F 38
+ case X86II::T8XS: // F3 0F 38
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TAXD: // F2 0F 38
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ case X86II::A6: // 0F A6
+ MCE.emitByte(0xA6);
+ break;
+ case X86II::A7: // 0F A7
+ MCE.emitByte(0xA7);
+ break;
+ }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequentially from 0-15 and encoded
+// in 1's complement form, example:
+//
+// ModRM field => XMM9 => 1
+// VEX.VVVV => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
+static unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) {
+ unsigned SrcReg = MI.getOperand(OpNum).getReg();
+ unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg());
+ if (X86II::isX86_64ExtendedReg(SrcReg))
+ SrcRegNum |= 8;
+
+ // The registers represented through VEX_VVVV should
+ // be encoded in 1's complement form.
+ return (~SrcRegNum) & 0xf;
+}
+
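A quick worked check of that inversion, over plain sequential register numbers (0-15) rather than MachineOperands:

    #include <cassert>

    // VEX.vvvv: the 4-bit sequential register number in 1's complement.
    static unsigned vexVVVV(unsigned SeqRegNum) { return (~SeqRegNum) & 0xf; }

    int main() {
      assert(vexVVVV(9)  == 6);   // XMM9: ~9 & 0xf == 0b0110
      assert(vexVVVV(0)  == 0xf); // XMM0 encodes as all ones
      assert(vexVVVV(15) == 0);   // XMM15 encodes as zero
    }
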
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI) const {
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: llvm_unreachable("Invalid segment!");
+ case 0:
+ // No segment override, check for explicit one on memory operand.
+ if (MemOperand != -1) { // If the instruction has a memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: MCE.emitByte(0x2E); break;
+ case X86::SS: MCE.emitByte(0x36); break;
+ case X86::DS: MCE.emitByte(0x3E); break;
+ case X86::ES: MCE.emitByte(0x26); break;
+ case X86::FS: MCE.emitByte(0x64); break;
+ case X86::GS: MCE.emitByte(0x65); break;
+ }
+ }
+ break;
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+ int MemOperand,
+ const MachineInstr &MI,
+ const MCInstrDesc *Desc) const {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+
+ // VEX_R: opcode extension equivalent to REX.R in
+ // 1's complement (inverted) form
+ //
+ // 1: Same as REX_R=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX_R=1 (64 bit mode only)
+ //
+ unsigned char VEX_R = 0x1;
+
+ // VEX_X: equivalent to REX.X, only used when a
+ // register is used for index in SIB Byte.
+ //
+ // 1: Same as REX.X=0 (must be 1 in 32-bit mode)
+ // 0: Same as REX.X=1 (64-bit mode only)
+ unsigned char VEX_X = 0x1;
+
+ // VEX_B:
+ //
+ // 1: Same as REX_B=0 (ignored in 32-bit mode)
+ // 0: Same as REX_B=1 (64 bit mode only)
+ //
+ unsigned char VEX_B = 0x1;
+
+ // VEX_W: opcode specific (use like REX.W, or used for
+ // opcode extension, or ignored, depending on the opcode byte)
+ unsigned char VEX_W = 0;
+
+ // XOP: Use XOP prefix byte 0x8f instead of VEX.
+ unsigned char XOP = 0;
+
+ // VEX_5M (VEX m-mmmmm field):
+ //
+ // 0b00000: Reserved for future use
+ // 0b00001: implied 0F leading opcode
+ // 0b00010: implied 0F 38 leading opcode bytes
+ // 0b00011: implied 0F 3A leading opcode bytes
+ //  0b01000: XOP map select - 08h instructions with imm byte
+ //  0b01001: XOP map select - 09h instructions with no imm byte
+ //  all other values: Reserved for future use
+ unsigned char VEX_5M = 0x1;
+
+ // VEX_4V (VEX vvvv field): a register specifier
+ // (in 1's complement form) or 1111 if unused.
+ unsigned char VEX_4V = 0xf;
+
+ // VEX_L (Vector Length):
+ //
+ // 0: scalar or 128-bit vector
+ // 1: 256-bit vector
+ //
+ unsigned char VEX_L = 0;
+
+ // VEX_PP: opcode extension providing equivalent
+ // functionality of a SIMD prefix
+ //
+ // 0b00: None
+ // 0b01: 66
+ // 0b10: F3
+ // 0b11: F2
+ //
+ unsigned char VEX_PP = 0;
+
+ // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ VEX_PP = 0x01;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+ VEX_W = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+ XOP = 1;
+
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+ VEX_L = 1;
+
+ switch (TSFlags & X86II::Op0Mask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::T8: // 0F 38
+ VEX_5M = 0x2;
+ break;
+ case X86II::TA: // 0F 3A
+ VEX_5M = 0x3;
+ break;
+ case X86II::T8XS: // F3 0F 38
+ VEX_PP = 0x2;
+ VEX_5M = 0x2;
+ break;
+ case X86II::T8XD: // F2 0F 38
+ VEX_PP = 0x3;
+ VEX_5M = 0x2;
+ break;
+ case X86II::TAXD: // F2 0F 3A
+ VEX_PP = 0x3;
+ VEX_5M = 0x3;
+ break;
+ case X86II::XS: // F3 0F
+ VEX_PP = 0x2;
+ break;
+ case X86II::XD: // F2 0F
+ VEX_PP = 0x3;
+ break;
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::A6: // Bypass: Not used by VEX
+ case X86II::A7: // Bypass: Not used by VEX
+ case X86II::TB: // Bypass: Not used by VEX
+ case 0:
+ break; // No prefix!
+ }
+
+
+ // Set the vector length to 256-bit if YMM0-YMM15 is used
+ for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ if (MI.getOperand(i).isImplicit())
+ continue;
+ unsigned SrcReg = MI.getOperand(i).getReg();
+ if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
+ VEX_L = 1;
+ }
+
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+ ++CurOp;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ // Duplicate register.
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MachineOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem:
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ // FMA4:
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_R = 0x0;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 1);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ // MRM[0-9]m instructions forms:
+ // MemAddr
+ // src1(VEX_4V), MemAddr
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
+ case X86II::MRMSrcReg:
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_R = 0x0;
+ CurOp++;
+
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ CurOp++;
+ if (HasVEX_4VOp3)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
+ break;
+ }
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ MCE.emitByte(0xC5);
+ MCE.emitByte(LastByte | (VEX_R << 7));
+ return;
+ }
+
+ // 3 byte VEX prefix
+ MCE.emitByte(XOP ? 0x8F : 0xC4);
+ MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+ MCE.emitByte(LastByte | (VEX_W << 7));
+}
+
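The byte diagrams above translate directly into shifts and ors; a standalone sketch assembling both forms from already-classified fields (names are local to the sketch, not LLVM API):

    #include <cstdint>
    #include <vector>

    // Assemble a VEX (or XOP) prefix from its bit fields.
    static std::vector<uint8_t> buildVEXPrefix(unsigned R, unsigned X,
                                               unsigned B, unsigned W,
                                               unsigned M5, unsigned V4,
                                               unsigned L, unsigned PP,
                                               bool XOP) {
      uint8_t LastByte = uint8_t(PP | (L << 2) | (V4 << 3));
      if (B && X && !W && !XOP && M5 == 1)   // 2-byte form: C5 [R vvvv L pp]
        return { 0xC5, uint8_t(LastByte | (R << 7)) };
      return { uint8_t(XOP ? 0x8F : 0xC4),   // 3-byte form
               uint8_t(R << 7 | X << 6 | B << 5 | M5),
               uint8_t(LastByte | (W << 7)) };
    }
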
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+ const MCInstrDesc *Desc) {
+ DEBUG(dbgs() << MI);
+
+ // If this is a pseudo instruction, lower it.
+ switch (Desc->getOpcode()) {
+ case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break;
+ case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break;
+ case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break;
+ case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break;
+ case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break;
+ case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break;
+ case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break;
+ case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break;
+ case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break;
+ case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break;
+ case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
+ case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
+ case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
+ case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break;
+ case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
+ case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
+ case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
}
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+ unsigned Opcode = Desc->Opcode;
+
// If this is a two-address instruction, skip one of the register operands.
unsigned NumOps = Desc->getNumOperands();
unsigned CurOp = 0;
- if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1)
+ if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1,MCOI::TIED_TO)== 0)
- // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
- --NumOps;
+ else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+ assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ // Special case for GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ }
+
+ uint64_t TSFlags = Desc->TSFlags;
+
+ // Is this instruction encoded using the AVX VEX prefix?
+ bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+ // It uses the VEX.VVVV field?
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ const unsigned MemOp4_I8IMMOperand = 2;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+ if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+ if (!HasVEXPrefix)
+ emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+ else
+ emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
- switch (Desc->TSFlags & X86II::FormMask) {
+ switch (TSFlags & X86II::FormMask) {
default:
llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
case X86II::Pseudo:
// Remember the current PC offset, this is the PIC relocation
// base address.
switch (Opcode) {
- default:
+ default:
llvm_unreachable("pseudo instructions should be removed before code"
" emission");
- break;
// Do nothing for Int_MemBarrier - it's just a comment. Add a debug
// to make it slightly easier to see.
case X86::Int_MemBarrier:
DEBUG(dbgs() << "#MEMBARRIER\n");
break;
-
+
case TargetOpcode::INLINEASM:
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
@@ -752,7 +1182,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::EH_LABEL:
MCE.emitLabel(MI.getOperand(0).getMCSymbol());
break;
-
+
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
break;
@@ -774,7 +1204,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO = MI.getOperand(CurOp++);
DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
@@ -787,13 +1217,13 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitPCRelativeBlockAddress(MO.getMBB());
break;
}
-
+
if (MO.isGlobal()) {
emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
MO.getOffset(), 0);
break;
}
-
+
if (MO.isSymbol()) {
emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
break;
@@ -804,7 +1234,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
break;
}
-
+
assert(MO.isImm() && "Unknown RawFrm operand!");
if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
// Fix up immediate operand for pc relative calls.
@@ -815,21 +1245,21 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
break;
}
-
+
case X86II::AddRegFrm: {
MCE.emitByte(BaseOpcode +
X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
-
+
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO1 = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO1.isImm()) {
emitConstant(MO1.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64ri64i32)
@@ -855,46 +1285,57 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
CurOp += 2;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
break;
}
case X86II::MRMDestMem: {
MCE.emitByte(BaseOpcode);
+
+ unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ SrcRegNum++;
emitMemModRMByte(MI, CurOp,
- X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
- .getReg()));
- CurOp += X86::AddrNumOperands + 1;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+ CurOp = SrcRegNum + 1;
break;
}
- case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg: {
MCE.emitByte(BaseOpcode);
- emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+
+ unsigned SrcRegNum = CurOp+1;
+ if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+ ++SrcRegNum;
+
+ emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
- CurOp += 2;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ // 2 operands skipped with HasMemOp4, compensate accordingly
+ CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+ if (HasVEX_4VOp3)
+ ++CurOp;
break;
-
+ }
case X86II::MRMSrcMem: {
int AddrOperands = X86::AddrNumOperands;
+ unsigned FirstMemOp = CurOp+1;
+ if (HasVEX_4V) {
+ ++AddrOperands;
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ }
+ if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+ ++FirstMemOp;
+
+ MCE.emitByte(BaseOpcode);
intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
X86II::getSizeOfImm(Desc->TSFlags) : 0;
-
- MCE.emitByte(BaseOpcode);
- emitMemModRMByte(MI, CurOp+1,
+ emitMemModRMByte(MI, FirstMemOp,
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
CurOp += AddrOperands + 1;
- if (CurOp != NumOps)
- emitConstant(MI.getOperand(CurOp++).getImm(),
- X86II::getSizeOfImm(Desc->TSFlags));
+ if (HasVEX_4VOp3)
+ ++CurOp;
break;
}
@@ -902,20 +1343,22 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
MCE.emitByte(BaseOpcode);
emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
(Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO1 = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO1.isImm()) {
emitConstant(MO1.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64ri32)
@@ -937,8 +1380,10 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m: {
+ if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+ ++CurOp;
intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
- (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
+ (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
MCE.emitByte(BaseOpcode);
@@ -948,14 +1393,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (CurOp == NumOps)
break;
-
+
const MachineOperand &MO = MI.getOperand(CurOp++);
unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
if (MO.isImm()) {
emitConstant(MO.getImm(), Size);
break;
}
-
+
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
if (Opcode == X86::MOV64mi32)
@@ -980,7 +1425,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
++CurOp;
break;
-
+
case X86II::MRM_C1:
MCE.emitByte(BaseOpcode);
MCE.emitByte(0xC1);
@@ -1003,6 +1448,33 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
break;
}
+ while (CurOp != NumOps && NumOps - CurOp <= 2) {
+ // The last source register of a 4-operand instruction in AVX is encoded
+ // in bits[7:4] of an immediate byte.
+ if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+ const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+ : CurOp);
+ ++CurOp;
+ unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4;
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ RegNum |= 1 << 7;
+ // If there is an additional 5th operand it must be an immediate, which
+ // is encoded in bits[3:0]
+ if (CurOp != NumOps) {
+ const MachineOperand &MIMM = MI.getOperand(CurOp++);
+ if (MIMM.isImm()) {
+ unsigned Val = MIMM.getImm();
+ assert(Val < 16 && "Immediate operand value out of range");
+ RegNum |= Val;
+ }
+ }
+ emitConstant(RegNum, 1);
+ } else {
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ }
+
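The bits[7:4]/bits[3:0] packing is simple to verify in isolation; a sketch over 4-bit sequential register numbers (the extended-register bit above ends up as bit 7 of the same nibble):

    #include <cassert>

    // Pack a source register (0-15) into bits[7:4] and an optional 4-bit
    // immediate into bits[3:0] of one immediate byte.
    static unsigned packRegImm8(unsigned SeqRegNum, unsigned Imm4) {
      return ((SeqRegNum & 0xf) << 4) | (Imm4 & 0xf);
    }

    int main() {
      assert(packRegImm8(9, 0) == 0x90); // XMM9, no trailing immediate
      assert(packRegImm8(3, 5) == 0x35); // XMM3 with immediate 5
    }
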
if (!MI.isVariadic() && CurOp != NumOps) {
#ifndef NDEBUG
dbgs() << "Cannot encode all operands of: " << MI << "\n";
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 69752c5..585b7a5 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -183,37 +183,37 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
case MVT::i1:
case MVT::i8:
Opc = X86::MOV8rm;
- RC = X86::GR8RegisterClass;
+ RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = X86::GR16RegisterClass;
+ RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
- RC = X86::RFP32RegisterClass;
+ RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
- RC = X86::RFP64RegisterClass;
+ RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
@@ -240,7 +240,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
- unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
Val = AndResult;
@@ -547,13 +547,13 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
if (TLI.getPointerTy() == MVT::i64) {
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
if (Subtarget->isPICStyleRIPRel())
StubAM.Base.Reg = X86::RIP;
} else {
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
}
LoadReg = createResultReg(RC);
@@ -743,7 +743,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
- I->getContext());
+ I->getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
@@ -1258,7 +1258,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (V->getType()->isFloatTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+ unsigned ResultReg = createResultReg(&X86::FR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::CVTSS2SDrr), ResultReg)
.addReg(OpReg);
@@ -1277,7 +1277,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (V->getType()->isDoubleTy()) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+ unsigned ResultReg = createResultReg(&X86::FR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(X86::CVTSD2SSrr), ResultReg)
.addReg(OpReg);
@@ -1314,8 +1314,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
if (!Subtarget->is64Bit()) {
// If we're on x86-32; we can't extract an i8 from a general register.
// First issue a copy to GR16_ABCD or GR32_ABCD.
- const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
- ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
+ (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
+ (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
unsigned CopyReg = createResultReg(CopyRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
CopyReg).addReg(InputReg);
@@ -1423,7 +1424,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
return DoSelectCall(&I, "memset");
}
case Intrinsic::stackprotector: {
- // Emit code inline code to store the stack guard onto the stack.
+ // Emit code to store the stack guard onto the stack.
EVT PtrTy = TLI.getPointerTy();
const Value *Op1 = I.getArgOperand(0); // The guard's value.
@@ -1484,7 +1485,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
return false;
// The call to CreateRegs builds two sequential registers, to store the
- // both the the returned values.
+ // two returned values.
unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
.addReg(Reg1).addReg(Reg2);
@@ -1548,12 +1549,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
- SmallVector<uint64_t, 4> Offsets;
GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(),
- Outs, TLI, &Offsets);
+ Outs, TLI);
bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
- *FuncInfo.MF, FTy->isVarArg(),
- Outs, FTy->getContext());
+ *FuncInfo.MF, FTy->isVarArg(),
+ Outs, FTy->getContext());
if (!CanLowerReturn)
return false;
@@ -1667,7 +1667,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
- I->getParent()->getContext());
+ I->getParent()->getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64())
@@ -1693,7 +1693,6 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Promote the value if needed.
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt: {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
@@ -1737,6 +1736,14 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
ArgVT = VA.getLocVT();
break;
}
+ case CCValAssign::VExt:
+ // VExt has not been implemented, so this should be impossible to reach
+ // for now. However, fall back to SelectionDAG isel once it is.
+ return false;
+ case CCValAssign::Indirect:
+ // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+ // support this.
+ return false;
}
if (VA.isRegLoc()) {
@@ -1838,25 +1845,27 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
MIB.addGlobalAddress(GV, 0, OpFlags);
}
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
// Add an implicit use GOT pointer in EBX.
if (Subtarget->isPICStyleGOT())
- MIB.addReg(X86::EBX);
+ MIB.addReg(X86::EBX, RegState::Implicit);
if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
- MIB.addReg(X86::AL);
+ MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
-
- // Add a register mask with the call-preserved registers.
- // Proper defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
unsigned NumBytesCallee = 0;
if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() &&
+ !(CS.getCallingConv() == CallingConv::Fast ||
+ CS.getCallingConv() == CallingConv::GHC) &&
CS.paramHasAttr(1, Attribute::StructRet))
NumBytesCallee = 4;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
@@ -1889,7 +1898,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
SmallVector<unsigned, 4> UsedRegs;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
- I->getParent()->getContext());
+ I->getParent()->getContext());
unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -1903,7 +1912,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
RVLocs[i].getLocReg() == X86::ST1)) {
if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
CopyVT = MVT::f80;
- CopyReg = createResultReg(X86::RFP80RegisterClass);
+ CopyReg = createResultReg(&X86::RFP80RegClass);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
CopyReg);
@@ -2001,37 +2010,37 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
default: return false;
case MVT::i8:
Opc = X86::MOV8rm;
- RC = X86::GR8RegisterClass;
+ RC = &X86::GR8RegClass;
break;
case MVT::i16:
Opc = X86::MOV16rm;
- RC = X86::GR16RegisterClass;
+ RC = &X86::GR16RegClass;
break;
case MVT::i32:
Opc = X86::MOV32rm;
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
break;
case MVT::i64:
// Must be in x86-64 mode.
Opc = X86::MOV64rm;
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
- RC = X86::RFP32RegisterClass;
+ RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
- RC = X86::RFP64RegisterClass;
+ RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
@@ -2124,19 +2133,19 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = X86::FsFLD0SS;
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp032;
- RC = X86::RFP32RegisterClass;
+ RC = &X86::RFP32RegClass;
}
break;
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = X86::FsFLD0SD;
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp064;
- RC = X86::RFP64RegisterClass;
+ RC = &X86::RFP64RegClass;
}
break;
case MVT::f80:
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index ed1707d..711ee41 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -130,7 +130,7 @@ namespace {
// The hardware keeps track of how many FP registers are live, so we have
// to model that exactly. Usually, each live register corresponds to an
// FP<n> register, but when dealing with calls, returns, and inline
- // assembly, it is sometimes neccesary to have live scratch registers.
+ // assembly, it is sometimes necessary to have live scratch registers.
unsigned Stack[8]; // FP<n> Registers in each stack slot...
unsigned StackTop; // The current top of the FP stack.
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 000e375..2238688 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -45,14 +45,14 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineModuleInfo &MMI = MF.getMMI();
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RI->needsStackRealignment(MF) ||
+ RegInfo->needsStackRealignment(MF) ||
MFI->hasVarSizedObjects() ||
MFI->isFrameAddressTaken() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
- MMI.callsUnwindInit());
+ MMI.callsUnwindInit() || MMI.callsEHReturn());
}
static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
@@ -125,8 +125,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI)
- Uses.insert(*AsI);
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
}
const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
@@ -369,7 +369,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
/// getCompactUnwindRegNum - Get the compact unwind number for a given
/// register. The number corresponds to the enum lists in
/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) {
+static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
@@ -398,13 +398,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
// 4 3
// 5 3
//
- static const unsigned CU32BitRegs[] = {
+ static const uint16_t CU32BitRegs[] = {
X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
};
- static const unsigned CU64BitRegs[] = {
+ static const uint16_t CU64BitRegs[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
- const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+ const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
@@ -466,13 +466,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
static uint32_t
encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
bool Is64Bit) {
- static const unsigned CU32BitRegs[] = {
+ static const uint16_t CU32BitRegs[] = {
X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
};
- static const unsigned CU64BitRegs[] = {
+ static const uint16_t CU64BitRegs[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
- const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+ const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
// Encode the registers in the order they were saved, 3-bits per register. The
// registers are numbered from 1 to CU_NUM_SAVED_REGS.
@@ -650,6 +650,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned SlotSize = RegInfo->getSlotSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
unsigned StackPtr = RegInfo->getStackRegister();
+ unsigned BasePtr = RegInfo->getBaseRegister();
DebugLoc DL;
// If we're forcing a stack realignment we can't rely on just the frame
@@ -721,10 +722,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
- if (RegInfo->needsStackRealignment(MF))
- FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
-
- NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers are pushed on stack before the stack
+ // is realigned.
+ FrameSize -= X86FI->getCalleeSavedFrameSize();
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ }
// Get the offset of the stack slot for the EBP register, which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -781,19 +786,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
I != E; ++I)
I->addLiveIn(FramePtr);
-
- // Realign stack
- if (RegInfo->needsStackRealignment(MF)) {
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
- .addReg(StackPtr)
- .addImm(-MaxAlign)
- .setMIFlag(MachineInstr::FrameSetup);
-
- // The EFLAGS implicit def is dead.
- MI->getOperand(3).setIsDead();
- }
} else {
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
}
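The before/after difference in this hunk is just where the callee-saved area is carved out relative to the rounding; a small numeric sketch of the new order, with made-up sizes:

    #include <cassert>
    #include <cstdint>

    // New prologue math: subtract the callee-saved pushes first, then round
    // the remaining frame up to MaxAlign.
    static uint64_t realignedFrameBytes(uint64_t StackSize, uint64_t SlotSize,
                                        uint64_t CSSize, uint64_t MaxAlign) {
      uint64_t FrameSize = StackSize - SlotSize; // minus the saved FP slot
      FrameSize -= CSSize;                       // pushed before realignment
      return (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
    }

    int main() {
      // 100-byte stack, 8-byte slot, 24 bytes of pushes, 32-byte alignment:
      // 100 - 8 - 24 = 68, which rounds up to 96.
      assert(realignedFrameBytes(100, 8, 24, 32) == 96);
    }
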
@@ -823,6 +815,27 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
}
}
+ // Realign stack after we pushed callee-saved registers (so that we'll be
+ // able to calculate their offsets from the frame pointer).
+
+ // NOTE: We push the registers before realigning the stack, so
+ // vector callee-saved (xmm) registers may be saved w/o proper
+ // alignment in this way. However, currently these regs are saved in
+ // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+ // this shouldn't be a problem.
+ if (RegInfo->needsStackRealignment(MF)) {
+ assert(HasFP && "There should be a frame pointer if stack is realigned.");
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+ .addReg(StackPtr)
+ .addImm(-MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
DL = MBB.findDebugLoc(MBBI);
// If there is an SUB32ri of ESP immediately before this instruction, merge
@@ -913,6 +926,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
UseLEA, TII, *RegInfo);
+ // If we need a base pointer, set it up here. It's whatever the value
+ // of the stack pointer is at this point. Any variable size objects
+ // will be allocated after this, so we can still use the base pointer
+ // to reference locals.
+ if (RegInfo->hasBasePointer(MF)) {
+ // Update the frame pointer with the current stack pointer.
+ unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
// Mark end of stack pointer adjustment.
MCSymbol *Label = MMI.getContext().CreateTempSymbol();
@@ -997,10 +1022,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (hasFP(MF)) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
- if (RegInfo->needsStackRealignment(MF))
- FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
-
- NumBytes = FrameSize - CSSize;
+ if (RegInfo->needsStackRealignment(MF)) {
+ // Callee-saved registers were pushed on stack before the stack
+ // was realigned.
+ FrameSize -= CSSize;
+ NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+ } else {
+ NumBytes = FrameSize - CSSize;
+ }
// Pop EBP.
BuildMI(MBB, MBBI, DL,
@@ -1010,7 +1039,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
// Skip the callee-saved pop instructions.
- MachineBasicBlock::iterator LastCSPop = MBBI;
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = prior(MBBI);
unsigned Opc = PI->getOpcode();
@@ -1021,6 +1049,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
--MBBI;
}
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
DL = MBBI->getDebugLoc();
@@ -1032,28 +1061,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
// realigned.
- if (RegInfo->needsStackRealignment(MF)) {
- // We cannot use LEA here, because stack pointer was realigned. We need to
- // deallocate local frame back.
- if (CSSize) {
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII,
- *RegInfo);
- MBBI = prior(LastCSPop);
- }
-
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
- StackPtr).addReg(FramePtr);
- } else if (MFI->hasVarSizedObjects()) {
- if (CSSize) {
- unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
- MachineInstr *MI =
- addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
- FramePtr, false, -CSSize);
- MBB.insert(MBBI, MI);
+ if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+ if (RegInfo->needsStackRealignment(MF))
+ MBBI = FirstCSPop;
+ if (CSSize != 0) {
+ unsigned Opc = getLEArOpcode(Is64Bit);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
} else {
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(FramePtr);
}
} else if (NumBytes) {
@@ -1124,8 +1141,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
MachineInstr *NewMI = prior(MBBI);
- for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
- NewMI->addOperand(MBBI->getOperand(i));
+ NewMI->copyImplicitOps(MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -1142,16 +1158,25 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
- const X86RegisterInfo *RI =
+ const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
const MachineFrameInfo *MFI = MF.getFrameInfo();
int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
uint64_t StackSize = MFI->getStackSize();
- if (RI->needsStackRealignment(MF)) {
+ if (RegInfo->hasBasePointer(MF)) {
+ assert(hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
if (FI < 0) {
// Skip the saved EBP.
- Offset += RI->getSlotSize();
+ return Offset + RegInfo->getSlotSize();
+ } else {
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ return Offset + StackSize;
+ }
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ return Offset + RegInfo->getSlotSize();
} else {
assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
return Offset + StackSize;
@@ -1162,7 +1187,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
return Offset + StackSize;
// Skip the saved EBP.
- Offset += RI->getSlotSize();
+ Offset += RegInfo->getSlotSize();
// Skip the RETADDR move area
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1174,6 +1199,22 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
return Offset;
}
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else if (RegInfo->needsStackRealignment(MF))
+ FrameReg = RegInfo->getStackRegister();
+ else
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return getFrameIndexOffset(MF, FI);
+}
+
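The priority here is deliberate: the base pointer (dynamic allocas plus realignment) wins over the stack pointer (realignment alone), which wins over the frame pointer. A compact sketch of just that ordering, with booleans standing in for the RegisterInfo queries:

    enum FrameReg { BasePtr, StackPtr, FramePtr };

    // Mirror of the selection in getFrameIndexReference above.
    static FrameReg pickFrameReg(bool HasBasePointer, bool NeedsRealignment) {
      if (HasBasePointer)   return BasePtr;
      if (NeedsRealignment) return StackPtr;
      return FramePtr;
    }
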
bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -1307,6 +1348,10 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
"Slot for EBP register must be last in order to be found!");
(void)FrameIdx;
}
+
+ // Spill the BasePtr if it's used.
+ if (RegInfo->hasBasePointer(MF))
+ MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
}
static bool
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index d55a497..dc515dc 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -60,6 +60,8 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const;
uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
};
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8e2b1d6..5186482 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -187,6 +187,7 @@ namespace {
private:
SDNode *Select(SDNode *N);
+ SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
@@ -1905,6 +1906,20 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
ChainCheck = true;
continue;
}
+
+ // Make sure using Op as part of the chain would not cause a cycle here.
+ // In theory, we could check whether the chain node is a predecessor of
+ // the load. But that can be very expensive. Instead visit the uses and
+ // make sure they all have a smaller node id than the load.
+ int LoadId = LoadNode->getNodeId();
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != 0)
+ continue;
+ if (UI->getNodeId() > LoadId)
+ return false;
+ }
+
ChainOps.push_back(Op);
}
@@ -1938,6 +1953,38 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
llvm_unreachable("unrecognized size for LdVT");
}
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+ // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+ SDValue Chain = Node->getOperand(0);
+ SDValue VSrc = Node->getOperand(2);
+ SDValue Base = Node->getOperand(3);
+ SDValue VIdx = Node->getOperand(4);
+ SDValue VMask = Node->getOperand(5);
+ ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+ if (!Scale)
+ return 0;
+
+ SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+ MVT::Other);
+
+ // Memory Operands: Base, Scale, Index, Disp, Segment
+ SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+ const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+ Disp, Segment, VMask, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ VTs, Ops, array_lengthof(Ops));
+ // Node has 2 outputs: VDst and MVT::Other.
+ // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+ // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+ // of ResNode.
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+ return ResNode;
+}
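
At the source level these gathers arrive through the AVX2 intrinsics; a
minimal sketch of the operation being selected (standard <immintrin.h>
names assumed, not code from this patch):

    #include <immintrin.h>
    // Lane i loads base[idx[i]] when the sign bit of mask lane i is set;
    // masked-off lanes keep their value from src. The byte scale 4
    // matches sizeof(float).
    __m256 gather8(const float *base, __m256i idx, __m256 src, __m256 mask) {
      return _mm256_mask_i32gather_ps(src, base, idx, mask, /*scale=*/4);
    }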
+
SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT NVT = Node->getValueType(0);
unsigned Opc, MOpc;
@@ -1953,23 +2000,82 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_avx2_gather_d_pd:
+ case Intrinsic::x86_avx2_gather_d_pd_256:
+ case Intrinsic::x86_avx2_gather_q_pd:
+ case Intrinsic::x86_avx2_gather_q_pd_256:
+ case Intrinsic::x86_avx2_gather_d_ps:
+ case Intrinsic::x86_avx2_gather_d_ps_256:
+ case Intrinsic::x86_avx2_gather_q_ps:
+ case Intrinsic::x86_avx2_gather_q_ps_256:
+ case Intrinsic::x86_avx2_gather_d_q:
+ case Intrinsic::x86_avx2_gather_d_q_256:
+ case Intrinsic::x86_avx2_gather_q_q:
+ case Intrinsic::x86_avx2_gather_q_q_256:
+ case Intrinsic::x86_avx2_gather_d_d:
+ case Intrinsic::x86_avx2_gather_d_d_256:
+ case Intrinsic::x86_avx2_gather_q_d:
+ case Intrinsic::x86_avx2_gather_q_d_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
+ case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
+ case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
+ case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
+ case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+ case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
+ case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
+ case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
+ case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
+ case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
+ case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
+ case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
+ case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
+ }
+ SDNode *RetVal = SelectGather(Node, Opc);
+ if (RetVal)
+ // We already called ReplaceUses inside SelectGather.
+ return NULL;
+ break;
+ }
+ }
+ break;
+ }
case X86ISD::GlobalBaseReg:
return getGlobalBaseReg();
+
case X86ISD::ATOMOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMOR6432);
case X86ISD::ATOMXOR64_DAG:
- return SelectAtomic64(Node, X86::ATOMXOR6432);
case X86ISD::ATOMADD64_DAG:
- return SelectAtomic64(Node, X86::ATOMADD6432);
case X86ISD::ATOMSUB64_DAG:
- return SelectAtomic64(Node, X86::ATOMSUB6432);
case X86ISD::ATOMNAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMNAND6432);
case X86ISD::ATOMAND64_DAG:
- return SelectAtomic64(Node, X86::ATOMAND6432);
- case X86ISD::ATOMSWAP64_DAG:
- return SelectAtomic64(Node, X86::ATOMSWAP6432);
+ case X86ISD::ATOMSWAP64_DAG: {
+ unsigned Opc;
+ switch (Opcode) {
+    default: llvm_unreachable("Impossible atomic opcode");
+ case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break;
+ case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break;
+ case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break;
+ case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
+ case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+ case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
+ case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+ }
+ SDNode *RetVal = SelectAtomic64(Node, Opc);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
case ISD::ATOMIC_LOAD_ADD: {
SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
@@ -2128,7 +2234,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
- N0, SDValue()).getValue(1);
+ N0, SDValue()).getValue(1);
if (foldedLoad) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
@@ -2168,7 +2274,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- LoReg, NVT, InFlag);
+ LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3042386..b88f2fa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -63,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
/// simple subregister reference. Idx is an index in the 128 bits we
 /// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, DebugLoc dl) {
EVT VT = Vec.getValueType();
assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
- int Factor = VT.getSizeInBits()/128;
+ unsigned Factor = VT.getSizeInBits()/128;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getNode(ISD::UNDEF, dl, ResultVT);
-
- if (isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return DAG.getUNDEF(ResultVT);
- // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
- // we can match to VEXTRACTF128.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
+ // we can match to VEXTRACTF128.
+ unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
- * ElemsPerChunk);
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ * ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
- VecIdx);
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+ VecIdx);
- return Result;
- }
-
- return SDValue();
+ return Result;
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
@@ -105,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec,
/// simple superregister reference. Idx is an index in the 128 bits
 /// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result,
- SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
DebugLoc dl) {
- if (isa<ConstantSDNode>(Idx)) {
- EVT VT = Vec.getValueType();
- assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
+  // Inserting an UNDEF vector is a no-op: just return Result.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
- EVT ElVT = VT.getVectorElementType();
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- EVT ResultVT = Result.getValueType();
+ EVT VT = Vec.getValueType();
+ assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
- // Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
- * ElemsPerChunk);
+ // Insert the relevant 128 bits.
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
- return Result;
- }
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ * ElemsPerChunk);
- return SDValue();
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+ VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
+/// instructions. This is used instead of a CONCAT_VECTORS node, because
+/// concatenating BUILD_VECTOR nodes just returns a larger BUILD_VECTOR
+/// while we're trying to lower large BUILD_VECTORs.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
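
By construction the helper is the inverse of two half extractions; a
sketch of the intended use (Wide, VT and NumElems are placeholders):

    // Split a 256-bit value into halves and reassemble it.
    SDValue Lo = Extract128BitVector(Wide, 0, DAG, dl);
    SDValue Hi = Extract128BitVector(Wide, NumElems/2, DAG, dl);
    SDValue Same = Concat128BitVectors(Lo, Hi, VT, NumElems, DAG, dl);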
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
@@ -141,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
if (Subtarget->isTargetEnvMacho()) {
if (is64Bit)
- return new X8664_MachoTargetObjectFile();
+ return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
+ if (Subtarget->isTargetLinux())
+ return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
@@ -163,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
TD = getTargetData();
// Set up the TargetLowering object.
- static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+ static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
@@ -172,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
- // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
- if (Subtarget->is64Bit())
+ // For Atom, always use ILP scheduling.
+ if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->is64Bit())
setSchedulingPreference(Sched::ILP);
- else if (Subtarget->isAtom())
- setSchedulingPreference(Sched::Hybrid);
else
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -216,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Set up the register classes.
- addRegisterClass(MVT::i8, X86::GR8RegisterClass);
- addRegisterClass(MVT::i16, X86::GR16RegisterClass);
- addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget->is64Bit())
- addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -346,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
@@ -493,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setShouldFoldAtomicFences(true);
// Expand certain atomics
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -568,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::FR64RegClass);
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS , MVT::f64, Custom);
@@ -600,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
@@ -633,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat) {
// f32 and f64 in x87.
// Set up the FP register classes.
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
- addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
setOperationAction(ISD::UNDEF, MVT::f32, Expand);
@@ -661,7 +662,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Long double always uses X87.
if (!TM.Options.UseSoftFloat) {
- addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
@@ -706,8 +707,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
@@ -765,8 +766,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
(MVT::SimpleValueType)InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
@@ -777,7 +778,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
- addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
@@ -814,7 +815,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
- addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
@@ -831,14 +832,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
- addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
// registers cannot be used even for integer operations.
- addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
- addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
- addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
- addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
setOperationAction(ISD::ADD, MVT::v16i8, Legal);
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
@@ -875,7 +876,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
EVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
@@ -904,7 +905,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
EVT VT = SVT;
@@ -1012,12 +1013,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
- addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
@@ -1122,8 +1123,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Custom lower several nodes for 256-bit types.
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
EVT VT = SVT;
@@ -1145,7 +1146,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
+ for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
EVT VT = SVT;
@@ -1168,14 +1169,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1223,13 +1225,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
- if (Subtarget->hasBMI())
- setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
@@ -1244,6 +1249,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setPrefLoopAlignment(4); // 2^4 bytes.
benefitFromCodePlacementOpt = true;
+  // Predictable cmovs don't hurt on Atom because it's in-order.
+ predictableSelectIsExpensive = !Subtarget->isAtom();
+
setPrefFunctionAlignment(4); // 2^4 bytes.
}
@@ -1277,7 +1285,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
break;
}
}
- return;
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -1412,18 +1419,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{
default:
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
- RRC = (Subtarget->is64Bit()
- ? X86::GR64RegisterClass : X86::GR32RegisterClass);
+ RRC = Subtarget->is64Bit() ?
+ (const TargetRegisterClass*)&X86::GR64RegClass :
+ (const TargetRegisterClass*)&X86::GR32RegClass;
break;
case MVT::x86mmx:
- RRC = X86::VR64RegisterClass;
+ RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
case MVT::v4f64:
- RRC = X86::VR128RegisterClass;
+ RRC = &X86::VR128RegClass;
break;
}
return std::make_pair(RRC, Cost);
@@ -1458,7 +1466,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
+ MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
@@ -1502,6 +1510,16 @@ X86TargetLowering::LowerReturn(SDValue Chain,
SDValue ValToCopy = OutVals[i];
EVT ValVT = ValToCopy.getValueType();
+ // Promote values to the appropriate types
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1639,7 +1657,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -1656,7 +1674,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SDValue Val;
// If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the the value is popped from the stack, so
+ // point stack, we must guarantee the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpPOP_RETVAL instruction
// instead.
@@ -1851,19 +1869,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
EVT RegVT = VA.getLocVT();
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
- RC = X86::FR64RegisterClass;
+ RC = &X86::FR64RegClass;
else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
- RC = X86::VR256RegisterClass;
+ RC = &X86::VR256RegClass;
else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
- RC = X86::VR128RegisterClass;
+ RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
- RC = X86::VR64RegisterClass;
+ RC = &X86::VR64RegClass;
else
llvm_unreachable("Unknown argument type!");
@@ -2005,7 +2023,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(Offset));
unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2021,7 +2039,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<SDValue, 11> SaveXMMOps;
SaveXMMOps.push_back(Chain);
- unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
SaveXMMOps.push_back(ALVal);
@@ -2032,7 +2050,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
- X86::VR128RegisterClass);
+ &X86::VR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
SaveXMMOps.push_back(Val);
}
@@ -2128,14 +2146,19 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
}
SDValue
-X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
@@ -2283,27 +2306,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Build a sequence of copy-to-reg nodes chained together with token chain
- // and flag operands which copy the outgoing args into registers.
- SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
- InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -2341,12 +2349,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
- Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
- DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
}
-
// For tail calls lower the arguments to the 'real' stack slot.
if (isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
@@ -2360,8 +2366,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -2401,19 +2405,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
- // Copy arguments to their registers.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag =SDValue();
-
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
FPDiff, dl);
}
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
@@ -2515,14 +2520,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // Add an implicit use GOT pointer in EBX.
- if (!isTailCall && Subtarget->isPICStyleGOT())
- Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
-
- // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
- if (Is64Bit && isVarArg && !IsWin64)
- Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
-
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2744,7 +2741,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2765,7 +2762,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -2779,12 +2776,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -2811,7 +2808,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64()) {
@@ -2912,6 +2909,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::UNPCKH:
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
+ case X86ISD::VPERMI:
return true;
}
}
@@ -3052,10 +3050,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_NS;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_LE;
@@ -3171,12 +3171,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
-/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
 /// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- int Pos, int Size, int Low) {
- for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
@@ -3195,8 +3195,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Lower quadword copied in order or undef.
@@ -3205,16 +3205,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
// Upper quadword shuffled.
for (unsigned i = 4; i != 8; ++i)
- if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ if (!isUndefOrInRange(Mask[i], 4, 8))
return false;
+ if (VT == MVT::v16i16) {
+ // Lower quadword copied in order or undef.
+ if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+ return false;
+
+ // Upper quadword shuffled.
+ for (unsigned i = 12; i != 16; ++i)
+ if (!isUndefOrInRange(Mask[i], 12, 16))
+ return false;
+ }
+
return true;
}
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Upper quadword copied in order.
@@ -3223,9 +3234,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
// Lower quadword shuffled.
for (unsigned i = 0; i != 4; ++i)
- if (Mask[i] >= 4)
+ if (!isUndefOrInRange(Mask[i], 0, 4))
+ return false;
+
+ if (VT == MVT::v16i16) {
+ // Upper quadword copied in order.
+ if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
return false;
+ // Lower quadword shuffled.
+ for (unsigned i = 8; i != 12; ++i)
+ if (!isUndefOrInRange(Mask[i], 8, 12))
+ return false;
+ }
+
return true;
}
@@ -3419,11 +3441,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i + NumElems))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
@@ -3439,17 +3461,63 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
|| VT.getSizeInBits() > 128)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
- if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i + e], i + NumElems))
return false;
return true;
}
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ DebugLoc dl = SVOp->getDebugLoc();
+
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return SDValue();
+
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // These are the special masks that may be optimized.
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ bool MatchEvenMask = true;
+ bool MatchOddMask = true;
+  for (int i = 0; i != 8; ++i) {
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+ MatchEvenMask = false;
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+ MatchOddMask = false;
+ }
+ static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+ static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+ const int *CompactionMask;
+ if (MatchEvenMask)
+ CompactionMask = CompactionMaskEven;
+ else if (MatchOddMask)
+ CompactionMask = CompactionMaskOdd;
+ else
+ return SDValue();
+
+ SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+ SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+ UndefNode, CompactionMask);
+ SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+ UndefNode, CompactionMask);
+ static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
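
Tracing the even case makes the mask algebra concrete (writing the lanes
of the two operands as A0..A7 and B0..B7):

    // CompactionMaskEven on V1: {A0, A2, u, u, A4, A6, u, u}  (u = undef)
    // CompactionMaskEven on V2: {B0, B2, u, u, B4, B6, u, u}
    // UnpackMask {0,8,1,9,4,12,5,13} interleaves the two results into
    //   {A0, B0, A2, B2, A4, B4, A6, B6}
    // which is exactly MaskToOptimizeEven = {0,8,2,10,4,12,6,14} applied
    // to (V1, V2), i.e. an in-lane unpack-low of the compacted operands.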
+
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
@@ -3881,9 +3949,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
for (unsigned i = 0; i != NumElts; ++i) {
int Elt = N->getMaskElt(i);
if (Elt < 0) continue;
- Elt %= NumLaneElts;
- unsigned ShAmt = i << Shift;
- if (ShAmt >= 8) ShAmt -= 8;
+ Elt &= NumLaneElts - 1;
+ unsigned ShAmt = (i << Shift) % 8;
Mask |= Elt << ShAmt;
}
@@ -3893,30 +3960,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the last 4.
- for (unsigned i = 7; i >= 4; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= (Val - 4);
- if (i != 4)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the last 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i+4);
+ if (Elt < 0) continue;
+      Elt &= 0x3; // only 2 bits.
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
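
A concrete check of the encoding: for the v8i16 mask <0,1,2,3,7,6,5,4>
(lower quadword in order, upper quadword reversed):

    // Elements 4..7 are 7,6,5,4; masked to two bits: 3,2,1,0.
    // Mask = 3<<0 | 2<<2 | 1<<4 | 0<<6 = 0x1B, i.e. pshufhw $0x1b.
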
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFLW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the first 4.
- for (int i = 3; i >= 0; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= Val;
- if (i != 0)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the first 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i);
+ if (Elt < 0) continue;
+      Elt &= 0x3; // only 2 bits.
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
@@ -4017,13 +4102,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i != NumElems; ++i) {
- int idx = SVOp->getMaskElt(i);
- if (idx < 0)
- MaskVec.push_back(idx);
- else if (idx < (int)NumElems)
- MaskVec.push_back(idx + NumElems);
- else
- MaskVec.push_back(idx - NumElems);
+ int Idx = SVOp->getMaskElt(i);
+ if (Idx >= 0) {
+ if (Idx < (int)NumElems)
+ Idx += NumElems;
+ else
+ Idx -= NumElems;
+ }
+ MaskVec.push_back(Idx);
}
return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
SVOp->getOperand(0), &MaskVec[0]);
@@ -4108,7 +4194,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i+NumElems))
return false;
return true;
@@ -4160,11 +4246,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
+ unsigned Size = VT.getSizeInBits();
// Always build SSE zero vectors as <4 x i32> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- if (VT.getSizeInBits() == 128) { // SSE
+ if (Size == 128) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4172,7 +4259,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
- } else if (VT.getSizeInBits() == 256) { // AVX
+ } else if (Size == 256) { // AVX
if (Subtarget->hasAVX2()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4184,7 +4271,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
}
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
+
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4195,25 +4284,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- assert((VT.is128BitVector() || VT.is256BitVector())
- && "Expected a 128-bit or 256-bit vector type");
+ unsigned Size = VT.getSizeInBits();
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
SDValue Vec;
- if (VT.getSizeInBits() == 256) {
+ if (Size == 256) {
if (HasAVX2) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
}
- } else {
+ } else if (Size == 128) {
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4256,9 +4342,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
- unsigned Half = NumElems/2;
SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != Half; ++i) {
+ for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
Mask.push_back(i + Half);
Mask.push_back(i + NumElems + Half);
}
@@ -4290,15 +4375,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
DebugLoc dl = V.getDebugLoc();
- assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
- && "Vector size not supported");
+ unsigned Size = VT.getSizeInBits();
- if (VT.getSizeInBits() == 128) {
+ if (Size == 128) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
&SplatMask[0]);
- } else {
+ } else if (Size == 256) {
     // To use VPERMILPS to splat scalars, the second half of indices must
// refer to the higher part, which is a duplication of the lower one,
// because VPERMILPS can only handle in-lane permutations.
@@ -4308,7 +4392,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
&SplatMask[0]);
- }
+ } else
+ llvm_unreachable("Vector size not supported");
return DAG.getNode(ISD::BITCAST, dl, VT, V);
}
@@ -4329,9 +4414,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
- unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
- V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
- if (Idx > 0)
+ V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+ if (EltNo >= NumElems/2)
EltNo -= NumElems/2;
}
@@ -4347,10 +4431,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// into the low and high part. This is necessary because we want
// to use VPERM* to shuffle the vectors
if (Size == 256) {
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- V1 = Insert128BitVector(InsV, V1,
- DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
}
return getLegalSplat(DAG, V1, EltNo);
@@ -4378,7 +4459,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated.
/// Sets IsUnary to true if only uses one source.
-static bool getTargetShuffleMask(SDNode *N, EVT VT,
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
@@ -4409,12 +4490,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT,
break;
case X86ISD::PSHUFHW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -4474,20 +4560,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- unsigned NumElems = VT.getVectorNumElements();
+ MVT ShufVT = V.getValueType().getSimpleVT();
+ unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SDValue ImmN;
bool IsUnary;
- if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt < 0)
- return DAG.getUNDEF(VT.getVectorElementType());
+ return DAG.getUNDEF(ShufVT.getVectorElementType());
SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
- : N->getOperand(1);
+ : N->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
@@ -4795,7 +4882,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
@@ -4803,7 +4890,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
false, false, false, 0);
SmallVector<int, 8> Mask;
- for (int i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
Mask.push_back(EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
@@ -4867,8 +4954,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->isInvariant(), LDBase->getAlignment());
- } else if (NumElems == 4 && LastLoadedElt == 1 &&
- DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+ }
+ if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
@@ -4897,6 +4985,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
SDValue Ld;
bool ConstSplatVal;
@@ -4931,8 +5022,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
SDValue Sc = Op.getOperand(0);
- if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
- return SDValue();
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
Ld = Sc.getOperand(0);
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
@@ -4948,7 +5048,6 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
}
bool Is256 = VT.getSizeInBits() == 256;
- bool Is128 = VT.getSizeInBits() == 128;
// Handle the broadcasting a single constant scalar from the constant pool
// into a vector. On Sandybridge it is still better to load a constant vector
@@ -4958,9 +5057,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
assert(!CVT.isVector() && "Must not broadcast a vector type");
unsigned ScalarSize = CVT.getSizeInBits();
- if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) ||
- (Is128 && (ScalarSize == 32))) {
-
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
const Constant *C = 0;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -4972,40 +5069,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
SDValue CP = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
- // The scalar source must be a normal load.
- if (!ISD::isNormalLoad(Ld.getNode()))
- return SDValue();
-
- // Reject loads that have uses of the chain result
- if (Ld->hasAnyUseOfValue(1))
- return SDValue();
-
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
unsigned ScalarSize = Ld.getValueType().getSizeInBits();
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget->hasAVX2() &&
+ (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 32))
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
- // double since there is vbroadcastsd xmm
+ // double since there is no vbroadcastsd xmm
if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
-
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
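
In user-visible terms, the register-form broadcast enabled above avoids a
memory round-trip entirely; a small sketch (standard AVX2 intrinsic name
assumed):

    #include <immintrin.h>
    // Splat the low float of an XMM register across a YMM register
    // (vbroadcastss ymm, xmm; the register form is AVX2 only).
    __m256 splat_lane0(__m128 v) { return _mm256_broadcastss_ps(v); }
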
@@ -5103,8 +5192,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Mask.push_back(Idx);
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
- Item = DAG.getVectorShuffle(VecVT, dl, Item,
- DAG.getUNDEF(Item.getValueType()),
+ Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
&Mask[0]);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5137,8 +5225,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
if (VT.getSizeInBits() == 256) {
SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
} else {
assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5172,7 +5259,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i++)
+ for (unsigned i = 0; i != NumElems; ++i)
MaskVec.push_back(i == Idx ? 0 : 1);
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
}
@@ -5213,10 +5300,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
NumElems/2);
// Recreate the wider vector with the lower and upper part.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -5383,10 +5467,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
SDValue
@@ -5408,75 +5489,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
// Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = Op.getValueType();
- EVT InVT = V1.getValueType();
- int MaskSize = VT.getVectorNumElements();
- int InSize = InVT.getVectorNumElements();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ unsigned NumElems = VT.getVectorNumElements();
if (!Subtarget->hasSSE41())
return SDValue();
- if (MaskSize != InSize)
- return SDValue();
-
- int ISDNo = 0;
+ unsigned ISDNo = 0;
MVT OpTy;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v8i16:
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v8i16;
- break;
+ ISDNo = X86ISD::BLENDPW;
+ OpTy = MVT::v8i16;
+ break;
case MVT::v4i32:
case MVT::v4f32:
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v4f32;
- break;
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v4f32;
+ break;
case MVT::v2i64:
case MVT::v2f64:
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v2f64;
- break;
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v2f64;
+ break;
case MVT::v8i32:
case MVT::v8f32:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v8f32;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v8f32;
+ break;
case MVT::v4i64:
case MVT::v4f64:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v4f64;
- break;
- case MVT::v16i16:
- if (!Subtarget->hasAVX2())
- return SDValue();
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v16i16;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v4f64;
+ break;
}
assert(ISDNo && "Invalid Op Number");
unsigned MaskVals = 0;
- for (int i = 0; i < MaskSize; ++i) {
+ for (unsigned i = 0; i != NumElems; ++i) {
int EltIdx = SVOp->getMaskElt(i);
- if (EltIdx == i || EltIdx == -1)
+ if (EltIdx == (int)i || EltIdx < 0)
MaskVals |= (1<<i);
- else if (EltIdx == (i + MaskSize))
+ else if (EltIdx == (int)(i + NumElems))
continue; // Bit is set to zero.
- else return SDValue();
+ else
+ return SDValue();
}
V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
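
The loop above is the whole blend test: bit i of the immediate is set when result element i stays in V1 (or is undef) and left clear when it comes from lane i of V2; any other index means the shuffle is not expressible as a blend. A standalone sketch of the scan:

    // Sketch: derive a blend bitmask from a shuffle mask, or fail.
    static bool matchBlendMask(const int *Mask, unsigned NumElems,
                               unsigned &BlendBits) {
      BlendBits = 0;
      for (unsigned i = 0; i != NumElems; ++i) {
        int EltIdx = Mask[i];
        if (EltIdx == (int)i || EltIdx < 0)
          BlendBits |= 1u << i;           // element i taken from V1 (or undef)
        else if (EltIdx != (int)(i + NumElems))
          return false;                   // cross-lane move: not a blend
        // EltIdx == i + NumElems: bit stays clear, element taken from V2
      }
      return true;
    }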
@@ -5630,13 +5700,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (TwoInputs && (EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+ int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
@@ -5650,13 +5717,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
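
Both rewritten loops build PSHUFB control vectors, and the 0x80 constant does the real work: PSHUFB zeroes a destination byte whenever bit 7 of its control byte is set, so bytes owned by the other input are masked out and the two partial results can simply be ORed. A sketch of the two control vectors for one word-level mask, assuming both inputs are used and ignoring undef entries:

    #include <cstdint>
    #include <vector>

    // Sketch: control bytes for shuffling 8 x i16 via two byte shuffles.
    // maskVals[i] in [0,16): word from V1; in [16,32): word from V2.
    // A control byte with the top bit set (0x80) zeroes that output byte.
    static void buildWordPshufbMasks(const int maskVals[8],
                                     std::vector<uint8_t> &forV1,
                                     std::vector<uint8_t> &forV2) {
      for (unsigned i = 0; i != 8; ++i) {
        int eltIdx = maskVals[i] * 2;     // byte index of the word's low byte
        bool fromV2 = eltIdx >= 16;
        forV1.push_back(fromV2 ? 0x80 : eltIdx);
        forV1.push_back(fromV2 ? 0x80 : eltIdx + 1);
        forV2.push_back(fromV2 ? eltIdx - 16 : 0x80);
        forV2.push_back(fromV2 ? eltIdx - 15 : 0x80);
      }
    }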
@@ -5732,10 +5796,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
- SDValue ExtOp = (EltIdx < 8)
- ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
- DAG.getIntPtrConstant(EltIdx))
- : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ SDValue ExtOp = (EltIdx < 8) ?
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx)) :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
@@ -5756,21 +5820,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
- // FIXME: kill V2Only once shuffles are canonizalized by getNode.
- bool V1Only = true;
- bool V2Only = true;
- for (unsigned i = 0; i < 16; ++i) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0)
- continue;
- if (EltIdx < 16)
- V2Only = false;
- else
- V1Only = false;
- }
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
@@ -5782,23 +5836,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
- bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
+ if (EltIdx < 0 || EltIdx >= 16)
+ EltIdx = 0x80;
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
- // If all the elements are from V2, assign it to V1 and return after
- // building the first pshufb.
- if (V2Only)
- V1 = V2;
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (!TwoInputs)
+ if (V2IsUndef)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5806,11 +5853,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
@@ -5823,7 +5867,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// the 16 different words that comprise the two doublequadword input vectors.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
- SDValue NewV = V2Only ? V2 : V1;
+ SDValue NewV = V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
@@ -5833,9 +5877,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
continue;
// This word of the result is already in the correct place, skip it.
- if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
- continue;
- if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ if ((Elt0 == i*2) && (Elt1 == i*2+1))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
@@ -5897,41 +5939,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG, DebugLoc dl) {
- EVT VT = SVOp->getValueType(0);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned NumElems = VT.getVectorNumElements();
- unsigned NewWidth = (NumElems == 4) ? 2 : 4;
- EVT NewVT;
- switch (VT.getSimpleVT().SimpleTy) {
+ MVT NewVT;
+ unsigned Scale;
+ switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected!");
- case MVT::v4f32: NewVT = MVT::v2f64; break;
- case MVT::v4i32: NewVT = MVT::v2i64; break;
- case MVT::v8i16: NewVT = MVT::v4i32; break;
- case MVT::v16i8: NewVT = MVT::v4i32; break;
+ case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
+ case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+ case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
}
- int Scale = NumElems / NewWidth;
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i += Scale) {
+ for (unsigned i = 0; i != NumElems; i += Scale) {
int StartIdx = -1;
- for (int j = 0; j < Scale; ++j) {
+ for (unsigned j = 0; j != Scale; ++j) {
int EltIdx = SVOp->getMaskElt(i+j);
if (EltIdx < 0)
continue;
- if (StartIdx == -1)
- StartIdx = EltIdx - (EltIdx % Scale);
- if (EltIdx != StartIdx + j)
+ if (StartIdx < 0)
+ StartIdx = (EltIdx / Scale);
+ if (EltIdx != (int)(StartIdx*Scale + j))
return SDValue();
}
- if (StartIdx == -1)
- MaskVec.push_back(-1);
- else
- MaskVec.push_back(StartIdx / Scale);
+ MaskVec.push_back(StartIdx);
}
- V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+ SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}
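
The narrowing criterion: each group of Scale consecutive mask entries must form one contiguous run starting at a Scale-aligned index, with undefs acting as wildcards. Keeping StartIdx in wide-element units (EltIdx / Scale) is what lets v16i8 and v32i8 narrow straight to 32-bit elements with Scale = 4. A standalone sketch of the test:

    #include <vector>

    // Sketch: rewrite a shuffle mask on mask.size() elements as a mask on
    // mask.size()/scale wider elements; fails if any group of scale entries
    // is not one contiguous, aligned run (entries < 0 are wildcards).
    static bool narrowShuffleMask(const std::vector<int> &mask, unsigned scale,
                                  std::vector<int> &wideMask) {
      for (unsigned i = 0; i != mask.size(); i += scale) {
        int startIdx = -1;                      // wide index for this group
        for (unsigned j = 0; j != scale; ++j) {
          int eltIdx = mask[i + j];
          if (eltIdx < 0)
            continue;                           // undef matches anything
          if (startIdx < 0)
            startIdx = eltIdx / scale;          // fixes the whole group
          if (eltIdx != (int)(startIdx * scale + j))
            return false;                       // not contiguous/aligned
        }
        wideMask.push_back(startIdx);           // -1 if group was all-undef
      }
      return true;
    }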
@@ -5974,6 +6012,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+ SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
@@ -5982,14 +6025,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
DebugLoc dl = SVOp->getDebugLoc();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
- SDValue Shufs[2];
+ SDValue Output[2];
SmallVector<int, 16> Mask;
for (unsigned l = 0; l < 2; ++l) {
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
- // out with useBuildVector set.
+ // out with UseBuildVector set.
+ bool UseBuildVector = false;
int InputUsed[2] = { -1, -1 }; // Not yet discovered.
unsigned LaneStart = l * NumLaneElems;
for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -6021,38 +6065,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
if (OpNo >= array_lengthof(InputUsed)) {
- // More than two input vectors used! Give up.
- return SDValue();
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ UseBuildVector = true;
+ break;
}
// Add the mask index for the new shuffle vector.
Mask.push_back(Idx + OpNo * NumLaneElems);
}
- if (InputUsed[0] < 0) {
+ if (UseBuildVector) {
+ SmallVector<SDValue, 16> SVOps;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0) {
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // The input vector this mask element indexes into.
+ int Input = Idx / NumElems;
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NumElems;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ SVOp->getOperand(Input),
+ DAG.getIntPtrConstant(Idx)));
+ }
+
+ // Construct the output using a BUILD_VECTOR.
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+ SVOps.size());
+ } else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
- Shufs[l] = DAG.getUNDEF(NVT);
+ Output[l] = DAG.getUNDEF(NVT);
} else {
SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
- DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[0] % 2) * NumLaneElems,
+ DAG, dl);
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
- DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
// At least one input vector was used. Create a new shuffle vector.
- Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+ Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
}
Mask.clear();
}
// Concatenate the result back
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0],
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32),
- DAG, dl);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}
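
The bookkeeping behind InputUsed deserves a note: a mask index addresses the concatenation V1:V2, and dividing by the per-lane element count yields a half id in [0, 4) — which is exactly what the InputUsed[x] / 2 and (InputUsed[x] % 2) * NumLaneElems expressions decode. A sketch of that arithmetic (helper name illustrative):

    // Sketch: decode a 256-bit shuffle-mask index into its 128-bit half.
    //   half id 0 = low half of V1, 1 = high half of V1,
    //   half id 2 = low half of V2, 3 = high half of V2.
    static void decodeHalfId(int maskIdx, unsigned numLaneElems,
                             unsigned &operand, unsigned &laneBase,
                             int &eltInLane) {
      int halfId = maskIdx / (int)numLaneElems;
      operand   = halfId / 2;                   // which source vector
      laneBase  = (halfId % 2) * numLaneElems;  // element offset of that half
      eltInLane = maskIdx % (int)numLaneElems;  // index within the half
    }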
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
@@ -6108,7 +6175,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
- } else if (NumLo == 3 || NumHi == 3) {
+ }
+
+ if (NumLo == 3 || NumHi == 3) {
// Otherwise, we must have three elements from one vector, call it X, and
// one element from the other, call it Y. First, use a shufps to build an
// intermediate vector with the one element from Y and the element from X
@@ -6144,17 +6213,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
Mask1[2] = HiIndex & 1 ? 6 : 4;
Mask1[3] = HiIndex & 1 ? 4 : 6;
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- } else {
- Mask1[0] = HiIndex & 1 ? 2 : 0;
- Mask1[1] = HiIndex & 1 ? 0 : 2;
- Mask1[2] = PermMask[2];
- Mask1[3] = PermMask[3];
- if (Mask1[2] >= 0)
- Mask1[2] += 4;
- if (Mask1[3] >= 0)
- Mask1[3] += 4;
- return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
+
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
// Break it into (shuffle shuffle_hi, shuffle_lo).
@@ -6303,7 +6372,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
- // If we don't care about the second element, procede to use movss.
+ // If we don't care about the second element, proceed to use movss.
if (SVOp->getMaskElt(1) != -1)
return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
@@ -6361,7 +6430,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ VT == MVT::v16i16 || VT == MVT::v32i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
@@ -6565,11 +6635,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// new vector_shuffle with the corrected mask.
SmallVector<int, 8> NewMask(M.begin(), M.end());
NormalizeMask(NewMask, NumElems);
- if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
}
if (Commuted) {
@@ -6606,12 +6675,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
}
- if (isPSHUFHWMask(M, VT))
+ if (isPSHUFHWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
getShufflePSHUFHWImmediate(SVOp),
DAG);
- if (isPSHUFLWMask(M, VT))
+ if (isPSHUFLWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
getShufflePSHUFLWImmediate(SVOp),
DAG);
@@ -6648,7 +6717,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
+ SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
@@ -6715,7 +6784,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 16) {
+ }
+
+ if (VT.getSizeInBits() == 16) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
if (Idx == 0)
@@ -6730,7 +6801,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT == MVT::f32) {
+ }
+
+ if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
@@ -6750,7 +6823,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1));
return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
- } else if (VT == MVT::i32 || VT == MVT::i64) {
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq works with constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
@@ -6777,12 +6852,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= NumElems/2;
- Vec = Extract128BitVector(Vec,
- DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+ Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ if (IdxVal >= NumElems/2)
+ IdxVal -= NumElems/2;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
+ DAG.getConstant(IdxVal, MVT::i32));
}
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
@@ -6812,7 +6887,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 32) {
+ }
+
+ if (VT.getSizeInBits() == 32) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
if (Idx == 0)
return Op;
@@ -6824,7 +6901,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0));
- } else if (VT.getSizeInBits() == 64) {
+ }
+
+ if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
@@ -6877,7 +6956,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
- } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ }
+
+ if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into these
// bits. For example (insert (extract, 3), 2) could be matched by putting
@@ -6890,8 +6971,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
- isa<ConstantSDNode>(N2)) {
+ }
+
+ if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
// PINSR* works with constant index.
return Op;
}
@@ -6917,16 +6999,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems/2;
- SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
- SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
+ SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired half.
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
- N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
+ bool Upper = IdxVal >= NumElems/2;
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
// Insert the changed part back to the 256-bit vector
- return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
+ return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
if (Subtarget->hasSSE41())
@@ -6964,19 +7045,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
- return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
- DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- if (Op.getValueType() == MVT::v1i64 &&
+ if (OpVT == MVT::v1i64 &&
Op.getOperand(0).getValueType() == MVT::i64)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
- "Expected an SSE type!");
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
+ assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!");
+ return DAG.getNode(ISD::BITCAST, dl, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
}
@@ -6990,9 +7068,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue Vec = Op.getNode()->getOperand(0);
SDValue Idx = Op.getNode()->getOperand(1);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 128
- && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
- return Extract128BitVector(Vec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).getSizeInBits() == 128 &&
+ Vec.getNode()->getValueType(0).getSizeInBits() == 256 &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Extract128BitVector(Vec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7009,9 +7089,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 256
- && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
- return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).getSizeInBits() == 256 &&
+ SubVec.getNode()->getValueType(0).getSizeInBits() == 128 &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7220,7 +7302,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
- unsigned char OperandFlags) {
+ unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
DebugLoc dl = GA->getDebugLoc();
@@ -7228,12 +7310,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -7265,11 +7351,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
X86::RAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
-// "local exec" model.
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (is64Bit) {
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff with the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
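
Local dynamic trades the per-variable __tls_get_addr call of general dynamic for one call per function that yields the module's TLS base, after which each variable is a constant x@dtpoff away; incNumLocalDynamicTLSAccesses is what later lets CleanupLocalDynamicTLSPass deduplicate the base computation. A source-level example that typically selects this model (tls_model is a standard GCC/Clang attribute; actual codegen depends on target and relocation model):

    // Two module-local TLS variables: under local-dynamic both share one
    // __tls_get_addr call, then are addressed at fixed @dtpoff offsets.
    static __thread int a __attribute__((tls_model("local-dynamic")));
    static __thread int b __attribute__((tls_model("local-dynamic")));

    int sum() { return a + b; }  // one base computation, two dtpoff adds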
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
- bool is64Bit) {
+ bool is64Bit, bool isPIC) {
DebugLoc dl = GA->getDebugLoc();
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
@@ -7287,25 +7411,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
- } else if (is64Bit) {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_GOTTPOFF;
- WrapperKind = X86ISD::WrapperRIP;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
} else {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_INDNTPOFF;
+ llvm_unreachable("Unexpected model");
}
- // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
- // exec)
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+ // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
- if (model == TLSModel::InitialExec)
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ Offset);
+ }
+
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false,
+ 0);
+ }
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
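
The new isPIC leg matters because x@gotntpoff is a GOT-relative offset, so 32-bit PIC code must add the GOT pointer (GlobalBaseReg, i.e. %ebx) before the load, whereas x@indntpoff is an absolute GOT-slot address and x@gottpoff is RIP-relative. A rough model of the three initial-exec address computations, with a hypothetical load() standing in for the GOT-slot dereference:

    #include <cstdint>
    extern uintptr_t load(uintptr_t addr);  // hypothetical: read one GOT slot

    // tp = thread pointer (%fs:0 / %gs:0), got = GOT base (%ebx).
    uintptr_t ie64(uintptr_t tp, uintptr_t slotRipAddr) {
      return tp + load(slotRipAddr);        // x86-64: x@gottpoff(%rip)
    }
    uintptr_t ie32NoPic(uintptr_t tp, uintptr_t slotAbsAddr) {
      return tp + load(slotAbsAddr);        // 32-bit: x@indntpoff
    }
    uintptr_t ie32Pic(uintptr_t tp, uintptr_t got, uintptr_t gotOff) {
      return tp + load(got + gotOff);       // 32-bit PIC: x@gotntpoff(%ebx)
    }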
@@ -7319,29 +7454,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- // TODO: implement the "local dynamic" model
- // TODO: implement the "initial exec"model for pic executables
-
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
-
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- case TLSModel::LocalDynamic: // not implemented
if (Subtarget->is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
- Subtarget->is64Bit());
+ Subtarget->is64Bit(),
+ getTargetMachine().getRelocationModel() == Reloc::PIC_);
}
- } else if (Subtarget->isTargetDarwin()) {
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget->isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
@@ -7384,7 +7516,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
Chain.getValue(1));
- } else if (Subtarget->isTargetWindows()) {
+ }
+
+ if (Subtarget->isTargetWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -7430,7 +7564,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
false, false, false, 0);
SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
- getPointerTy());
+ getPointerTy());
IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
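
The Scale/SHL pair is pointer arithmetic spelled out by hand: _tls_index, scaled by log2(sizeof(void*)), indexes the per-thread array of per-module TLS blocks that [gs:58H] points at. In plain C++ the address computation is just:

    // Sketch: pick this module's slot in the thread's TLS array (Windows
    // implicit TLS). tlsArray is what the code above loads from gs:[0x58].
    char **moduleSlot(char **tlsArray, unsigned tlsIndex) {
      return tlsArray + tlsIndex;  // compiler emits the shift by log2(ptr size)
    }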
@@ -7694,12 +7828,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
// Handle final rounding.
EVT DestVT = Op.getValueType();
- if (DestVT.bitsLT(MVT::f64)) {
+ if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
DAG.getIntPtrConstant(0));
- } else if (DestVT.bitsGT(MVT::f64)) {
+ if (DestVT.bitsGT(MVT::f64))
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
- }
// Handle final rounding.
return Sub;
@@ -7720,10 +7853,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
EVT DstVT = Op.getValueType();
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
- else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- else if (Subtarget->is64Bit() &&
- SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
@@ -7900,9 +8032,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
@@ -7917,9 +8049,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
@@ -7969,12 +8101,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, XORVT,
- DAG.getNode(ISD::BITCAST, dl, XORVT,
- Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
- } else {
- return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ DAG.getNode(ISD::BITCAST, dl, XORVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
}
+
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
@@ -8173,7 +8305,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
// Otherwise use a regular EFLAGS-setting instruction.
switch (Op.getNode()->getOpcode()) {
default: llvm_unreachable("unexpected operator!");
- case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::SUB:
+ // If the only use of SUB is EFLAGS, use CMP instead.
+ if (Op.hasOneUse())
+ Opcode = X86ISD::CMP;
+ else
+ Opcode = X86ISD::SUB;
+ break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
@@ -8199,6 +8337,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
+ if (Opcode == X86ISD::CMP) {
+ SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
+ Op.getOperand(1));
+ // We can't replace the SUB's uses with the CMP's result.
+ // The now-dead SUB node will be removed later since nothing uses it.
+ return SDValue(New.getNode(), 0);
+ }
+
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i != NumOperands; ++i)
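
The motivation for preferring CMP when the SUB has a single use: CMP sets the identical flags but defines no register, so the subtraction's dead result never occupies one. At source level the two cases look roughly like this (a sketch; exact selection depends on surrounding code):

    // Sketch: when only the flags of a-b are consumed, a CMP suffices.
    bool lessThanZeroDiff(int a, int b) {
      return (a - b) < 0;   // sole use is the comparison: becomes cmp + setcc
    }
    int subAndTest(int a, int b, int *out) {
      int d = a - b;        // the difference has a real use, so the SUB
      *out = d;             // must stay; its EFLAGS result is reused for
      return d < 0;         // the comparison (the multi-use case above)
    }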
@@ -8221,6 +8367,30 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+ SelectionDAG &DAG) const {
+ // If the subtarget does not support the FUCOMI instruction, floating-point
+ // comparisons have to be converted.
+ if (Subtarget->hasCMov() ||
+ Cmp.getOpcode() != X86ISD::CMP ||
+ !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ return Cmp;
+
+ // The instruction selector will select an FUCOM instruction instead of
+ // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+ // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ DebugLoc dl = Cmp.getDebugLoc();
+ SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+ SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+ DAG.getConstant(8, MVT::i8));
+ SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
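
The srl-by-8 in that sequence is not arbitrary: SAHF loads AH into the low byte of EFLAGS, and after shifting FPSW right by 8 the x87 condition bits land exactly where the comparison flags live. A sketch of the bit bookkeeping (FPSW bit positions per the Intel SDM):

    #include <cstdint>

    // Sketch: why (fpsw >> 8) followed by SAHF yields usable EFLAGS.
    // x87 status word: C0 = bit 8, C2 = bit 10, C3 = bit 14.
    // SAHF copies AH into SF:ZF:0:AF:0:PF:1:CF (EFLAGS bits 7..0).
    struct Flags { bool cf, pf, zf; };

    Flags fpswToEflags(uint16_t fpsw) {
      uint8_t ah = uint8_t(fpsw >> 8);   // the TRUNCATE(SRL(..., 8)) above
      Flags f;
      f.cf = ah & 0x01;                  // C0 -> CF (bit 0)
      f.pf = ah & 0x04;                  // C2 -> PF (bit 2)
      f.zf = ah & 0x40;                  // C3 -> ZF (bit 6)
      return f;                          // same mapping FUCOMI sets directly
    }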
+
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -8342,6 +8512,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
@@ -8354,21 +8525,19 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
SDValue CC = Op.getOperand(2);
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType().getSimpleVT();
@@ -8438,7 +8607,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
DAG.getConstant(0, MVT::i8));
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
- } else if (SetCCOpcode == ISD::SETONE) {
+ }
+ if (SetCCOpcode == ISD::SETONE) {
SDValue ORD, NEQ;
ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
DAG.getConstant(7, MVT::i8));
@@ -8511,7 +8681,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
- if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD ||
@@ -8557,6 +8728,46 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond = NewCond;
}
+ // Handle the following cases related to max and min:
+ // (a > b) ? (a-b) : 0
+ // (a >= b) ? (a-b) : 0
+ // (b < a) ? (a-b) : 0
+ // (b <= a) ? (a-b) : 0
+ // The comparison is removed so that the EFLAGS set by the SUB can be used.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2))
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+ (Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) &&
+ C->getAPIntValue() == 0) {
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned CC = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) &&
+ DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) &&
+ (CC == X86::COND_G || CC == X86::COND_GE ||
+ CC == X86::COND_A || CC == X86::COND_AE)) ||
+ (DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) &&
+ DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) &&
+ (CC == X86::COND_L || CC == X86::COND_LE ||
+ CC == X86::COND_B || CC == X86::COND_BE))) {
+
+ if (Op1.getOpcode() == ISD::SUB) {
+ SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32);
+ SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs,
+ Op1.getOperand(0), Op1.getOperand(1));
+ DAG.ReplaceAllUsesWith(Op1, New);
+ Op1 = New;
+ }
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE ||
+ CC == X86::COND_L ||
+ CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE;
+ SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8),
+ SDValue(Op1.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+ }
+ }
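
All four guarded comparisons are spellings of the same saturating subtraction, where the SUB both produces the result and sets the flags that decide whether to keep it, making the separate CMP redundant. For instance:

    // Sketch: source patterns covered by the transform. Each becomes
    // sub + cmov, with the cmov predicated on the sub's own EFLAGS.
    unsigned satSub(unsigned a, unsigned b) {
      return a > b ? a - b : 0;          // (a > b) ? (a-b) : 0
    }
    unsigned satSubFlipped(unsigned a, unsigned b) {
      return b <= a ? a - b : 0;         // (b <= a) ? (a-b) : 0, same result
    }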
+
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
@@ -8573,8 +8784,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+ if (YC->isNullValue() &&
+ (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
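
The neg & sbb idiom being formed here: NEG sets CF exactly when its operand is nonzero, and SBB r, r then computes r - r - CF, i.e. 0 or -1, materializing the boolean as a full-width mask with no branch. In C terms:

    #include <cstdint>

    // Sketch: the mask the neg/sbb pair materializes.
    int32_t nonZeroMask(uint32_t x) {
      // neg x     -> CF = (x != 0)
      // sbb r, r  -> r = 0 - CF = x ? -1 : 0
      return x ? -1 : 0;   // branchless on x86 via the sequence above
    }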
+
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
@@ -8681,6 +8909,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::CMP) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -8919,6 +9148,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8948,6 +9178,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8981,6 +9212,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DAG);
}
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
@@ -9019,7 +9251,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
const Function *F = MF.getFunction();
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; I++)
+ I != E; ++I)
if (I->hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
@@ -9202,12 +9434,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
if (isa<ConstantSDNode>(ShAmt)) {
+ // Constant may be a TargetConstant. Use a regular constant.
+ uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, MVT::i32));
}
}
@@ -9227,7 +9462,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
ShOps[2] = DAG.getUNDEF(MVT::i32);
ShOps[3] = DAG.getUNDEF(MVT::i32);
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
@@ -9337,196 +9578,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // XOP comparison intrinsics
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq: {
- unsigned CC = 0;
- unsigned Opc = 0;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- CC = 0;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- CC = 0;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- CC = 1;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- CC = 1;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- CC = 2;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- CC = 2;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- CC = 3;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- CC = 3;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- CC = 4;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- CC = 4;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- CC = 5;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- CC = 5;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- CC = 6;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- CC = 6;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- CC = 7;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq:
- CC = 7;
- Opc = X86ISD::VPCOMU;
- break;
- }
-
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
- DAG.getConstant(CC, MVT::i8));
- }
-
// Arithmetic intrinsics.
case Intrinsic::x86_sse2_pmulu_dq:
case Intrinsic::x86_avx2_pmulu_dq:
@@ -9770,6 +9821,38 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
}
}
+SDValue
+X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // RDRAND intrinsics.
+ case Intrinsic::x86_rdrand_16:
+ case Intrinsic::x86_rdrand_32:
+ case Intrinsic::x86_rdrand_64: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
+ // return the value from RDRAND, which is always 0 in that case, cast to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops, 4);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
+ }
+}
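
The CF=1 protocol modeled by that CMOV is the same one the user-level step intrinsics expose: their int return value is the carry flag, and on failure the destination is architecturally zero. A typical consumer (requires a target with RDRAND, e.g. -mrdrnd):

    #include <immintrin.h>

    // Sketch: standard retry loop over the rdrand step intrinsic, whose
    // int return value is exactly the CF bit modeled by the CMOV above.
    bool getRandom32(unsigned int &out) {
      for (int tries = 0; tries != 10; ++tries)
        if (_rdrand32_step(&out))   // 1: out is valid; 0: out is 0, retry
          return true;
      return false;                 // hardware entropy temporarily exhausted
    }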
+
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -9817,7 +9900,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
@@ -9834,7 +9916,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
- MF.getRegInfo().addLiveOut(StoreAddrReg);
return DAG.getNode(X86ISD::EH_RETURN, dl,
MVT::Other,
@@ -10153,20 +10234,18 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10311,6 +10390,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
@@ -10354,6 +10434,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
}
}
@@ -10428,9 +10509,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
- SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+ SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
// Recreate the shift amount vectors
SDValue Amt1, Amt2;
@@ -10449,9 +10529,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
&Amt2Csts[0], NumElems/2);
} else {
// Variable shift amount
- Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
- Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+ Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
}
// Issue new vector shifts for the smaller types
@@ -10561,20 +10640,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return SDValue();
if (!Subtarget->hasAVX2()) {
// needs to be split
- int NumElems = VT.getVectorNumElements();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+ unsigned NumElems = VT.getVectorNumElements();
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
EVT ExtraEltVT = ExtraVT.getVectorElementType();
- int ExtraNumElems = ExtraVT.getVectorNumElements();
+ unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
ExtraNumElems/2);
SDValue Extra = DAG.getValueType(ExtraVT);
@@ -10860,6 +10937,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
@@ -11119,10 +11197,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
@@ -11191,6 +11271,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
}
}
@@ -11259,6 +11341,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return true;
}
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return Imm == (int32_t)Imm;
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return Imm == (int32_t)Imm;
+}
+
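The predicate `Imm == (int32_t)Imm` accepts exactly the values that survive a round trip through a sign-extended 32-bit immediate, which is what the x86-64 ALU encodings provide. A standalone sketch of the behavior (illustrative only; the helper name is made up):

#include <cassert>
#include <cstdint>

// Mirrors the predicate above: true iff Imm is representable as a
// sign-extended 32-bit immediate.
static bool fitsSExt32(int64_t Imm) { return Imm == (int32_t)Imm; }

int main() {
  assert(fitsSExt32(0));
  assert(fitsSExt32(INT32_MAX));   // 0x7fffffff encodes directly
  assert(fitsSExt32(INT32_MIN));   // sign-extends back to the same value
  assert(!fitsSExt32(1LL << 31));  // 0x80000000 flips sign when truncated
  assert(!fitsSExt32(INT64_MIN));  // far outside the imm32 range
  return 0;
}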
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
@@ -11301,8 +11392,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isMOVLMask(M, VT) ||
isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
isPSHUFDMask(M, VT) ||
- isPSHUFHWMask(M, VT) ||
- isPSHUFLWMask(M, VT) ||
+ isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
+ isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
isPALIGNRMask(M, VT, Subtarget) ||
isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
@@ -11461,7 +11552,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
// result in out1, out2
// fallthrough -->nextMBB
- const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
const unsigned LoadOpc = X86::MOV32rm;
const unsigned NotOpc = X86::NOT32r;
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -11663,7 +11754,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
int valArgIndx = lastAddrIndx + 1;
- unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
for (int i=0; i <= lastAddrIndx; ++i)
(*MIB).addOperand(*argOpers[i]);
@@ -11673,7 +11764,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
argOpers[valArgIndx]->isImm()) &&
"invalid operand");
- unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
if (argOpers[valArgIndx]->isReg())
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
else
@@ -11688,7 +11779,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
MIB.addReg(t2);
// Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
MIB.addReg(t2);
MIB.addReg(t1);
@@ -12307,8 +12398,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+ .addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
@@ -12518,7 +12610,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Load the old value of the high byte of the control word...
unsigned OldCW =
- F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
@@ -12606,25 +12698,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
X86::OR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMXOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
X86::XOR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMNAND32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass, true);
+ &X86::GR32RegClass, true);
case X86::ATOMMIN32:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
case X86::ATOMMAX32:
@@ -12639,25 +12731,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
X86::OR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMXOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
X86::XOR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMNAND16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass, true);
+ &X86::GR16RegClass, true);
case X86::ATOMMIN16:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
case X86::ATOMMAX16:
@@ -12672,25 +12764,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
X86::OR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMXOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
X86::XOR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMNAND8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass, true);
+ &X86::GR8RegClass, true);
// FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
// This group is for 64-bit host.
case X86::ATOMAND64:
@@ -12698,25 +12790,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
X86::OR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMXOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
X86::XOR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMNAND64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass, true);
+ &X86::GR64RegClass, true);
case X86::ATOMMIN64:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
case X86::ATOMMAX64:
@@ -12871,10 +12963,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
/// inserting the result into the low part of a new 256-bit vector
static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12887,10 +12979,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
/// inserting the result into the high part of a new 256-bit vector
static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12907,7 +12999,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -12932,30 +13024,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// To match the shuffle mask, the first half of the mask should
// be exactly the first vector, and all the rest a splat with the
// first element of the second one.
- for (int i = 0; i < NumElems/2; ++i)
+ for (unsigned i = 0; i != NumElems/2; ++i)
if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
!isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
return SDValue();
// If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
- SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
- SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
- Ld->getMemoryVT(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
- return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ if (Ld->hasNUsesOfValue(1, 0)) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+ return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ }
}
  // Emit a zeroed vector and insert the desired subvector into its
  // first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
- DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -12965,18 +13058,15 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -13015,7 +13105,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
@@ -13024,7 +13114,8 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX()) return SDValue();
+ if (!Subtarget->hasAVX())
+ return SDValue();
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
@@ -13033,55 +13124,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v4i64 -> v4i32
+
+ // VPERMD
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+ Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // AVX: v4i64 -> v4i32
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(2));
+ DAG.getIntPtrConstant(2));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// PSHUFD
- int ShufMask1[] = {0, 2, 0, 0};
+ static const int ShufMask1[] = {0, 2, 0, 0};
- OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
- ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
- ShufMask1);
+ OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
}
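In element terms, the AVX path compresses each 128-bit half with PSHUFD {0,2,0,0} (keeping the low dword of every qword) and then pastes the two compressed halves together with the MOVLHPS-style mask {0,1,4,5}. A scalar model of the index arithmetic (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // A v4i64 input viewed as 8 dwords: element i occupies dwords {2i, 2i+1}.
  std::array<uint32_t, 8> In = {10, 0, 20, 0, 30, 0, 40, 0};

  // PSHUFD {0, 2, 0, 0} on each v4i32 half compresses the low dwords.
  std::array<uint32_t, 4> Lo = {In[0], In[2], In[0], In[0]};
  std::array<uint32_t, 4> Hi = {In[4], In[6], In[4], In[4]};

  // The {0, 1, 4, 5} shuffle (MOVLHPS) pastes the two compressed halves.
  std::array<uint32_t, 4> Out = {Lo[0], Lo[1], Hi[0], Hi[1]};
  assert((Out == std::array<uint32_t, 4>{10, 20, 30, 40}));
  return 0;
}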
+
if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v8i32 -> v8i16
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+
+ // PSHUFB
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
+ &pshufbMask[0], 32);
+ Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+ DAG.getIntPtrConstant(0));
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+ }
+
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
// PSHUFB
- int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1};
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
- OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
- DAG.getUNDEF(MVT::v16i8),
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8),
ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
- DAG.getUNDEF(MVT::v16i8),
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8),
ShufMask1);
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
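The PSHUFB masks select bytes {0,1,4,5,8,9,12,13} — the low two bytes of each dword — and fill the remaining slots with 0x80, which PSHUFB interprets as "write zero". A byte-level model, assuming little-endian lane layout (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // One 128-bit half of the v8i32 input as 16 bytes (4 dwords holding
  // 0x11AA, 0x22BB, 0x33CC, 0x44DD, little-endian).
  std::array<uint8_t, 16> In = {0xAA, 0x11, 0, 0, 0xBB, 0x22, 0, 0,
                                0xCC, 0x33, 0, 0, 0xDD, 0x44, 0, 0};
  const int Mask[16] = {0, 1, 4, 5, 8, 9, 12, 13,
                        -1, -1, -1, -1, -1, -1, -1, -1}; // -1 ~ 0x80: zero
  std::array<uint8_t, 16> Out{};
  for (int i = 0; i != 16; ++i)
    Out[i] = Mask[i] < 0 ? 0 : In[Mask[i]];
  // The low 8 bytes now hold the four truncated i16 values back to back.
  assert(Out[0] == 0xAA && Out[1] == 0x11 && Out[6] == 0xDD && Out[7] == 0x44);
  return 0;
}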
@@ -13128,7 +13266,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 16> ShuffleMask;
bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
+ if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+ UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
@@ -13277,8 +13416,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
-
-
DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
@@ -13560,9 +13697,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// to simplify previous instructions.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() &&
- TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+ // Don't optimize vector selects that map to mask-registers.
+ if (BitWidth == 1)
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -14261,6 +14402,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+  // Since X86 does not have CMOV for 8-bit integers, we don't convert
+  // 8-bit integer abs to NEG and CMOV.
+ if (VT.isInteger() && VT.getSizeInBits() == 8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CMOV.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA &&
+ N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, VT), N0.getOperand(0));
+
+ SDValue Ops[] = { N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, MVT::i8),
+ SDValue(Neg.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+ Ops, array_lengthof(Ops));
+ }
+ return SDValue();
+}
+
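The matched pattern is the classic branch-free abs: with Y = SRA(X, bits-1), (X + Y) ^ Y equals |X| for every X except the minimum value, and the combine replaces it with 0 - X plus a CMOV on the sign flag. A scalar check of the identity (illustrative only; arithmetic right shift of negatives is assumed, as on all targets LLVM supports):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Branch-free abs, as matched above: XOR(ADD(X, Y), Y) with Y = SRA(X, 31).
static int32_t absViaShift(int32_t X) {
  int32_t Y = X >> 31;  // 0 for non-negative X, -1 for negative X
  return (X + Y) ^ Y;
}

int main() {
  for (int32_t X : {0, 1, -1, 42, -42, INT32_MAX})
    assert(absViaShift(X) == std::abs(X));
  return 0;
}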
// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -14268,6 +14444,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (Subtarget->hasCMov()) {
+ SDValue RV = performIntegerAbsCombine(N, DAG);
+ if (RV.getNode())
+ return RV;
+ }
+
+ // Try forming BMI if it is available.
+ if (!Subtarget->hasBMI())
+ return SDValue();
+
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
@@ -14293,7 +14479,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
@@ -14315,63 +14502,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- // All sizes must be a power of two
- if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
- // Attempt to load the original value using a single load op.
- // Find a scalar type which is equal to the loaded word size.
+ // All sizes must be a power of two.
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+ return SDValue();
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
- break;
}
}
- // Proceed if a load word is found.
- if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+  // On 32-bit systems, we can't load 64-bit integers. Try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
RegSz/SclrLoadTy.getSizeInBits());
+ // Represent the data using the same element type that is stored in
+  // memory. In practice, we "widen" MemVT.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
RegSz/MemVT.getScalarType().getSizeInBits());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
- // Perform a single load.
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
- // Insert the word loaded into a vector.
- SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- LoadUnitVecVT, ScalarLoad);
+ // We can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+    // Create the first element using SCALAR_TO_VECTOR in order to avoid
+    // another round of DAG combining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
- ScalarInVector);
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(SlicedVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// Bitcast to the requested type.
Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
// Replace the original load with the new sequence
// and return the new chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
- return SDValue(ScalarLoad.getNode(), 1);
+ return DCI.CombineTo(N, Shuff, TF, true);
}
return SDValue();
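The key step is the redistribution shuffle: `ShuffleVec[i*SizeRatio] = i` moves packed element i of the loaded data into every SizeRatio-th slot of the wide vector, so the final bitcast sees each memory element in the low bits of its widened lane. An index-level sketch with assumed sizes (illustrative only):

#include <cassert>
#include <vector>

int main() {
  // Example: MemVT = v4i8 loaded into RegVT = v4i32, so SizeRatio = 4 and
  // the shuffle operates on WideVecVT = v16i8.
  const unsigned NumElems = 4, SizeRatio = 4;
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1); // -1 = undef lane
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio] = i;
  assert(ShuffleVec[0] == 0 && ShuffleVec[4] == 1 &&
         ShuffleVec[8] == 2 && ShuffleVec[12] == 3);
  return 0;
}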
@@ -14388,13 +14606,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we are saving a concatenation of two XMM registers, perform two stores.
- // This is better in Sandy Bridge cause one 256-bit mem op is done via two
- // 128-bit ones. If in the future the cost becomes only one memory access the
- // first version would be better.
- if (VT.getSizeInBits() == 256 &&
- StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
- StoredVal.getNumOperands() == 2) {
-
+ // On Sandy Bridge, 256-bit memory operations are executed by two
+ // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+ // memory operation.
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
SDValue Value0 = StoredVal.getOperand(0);
SDValue Value1 = StoredVal.getOperand(1);
@@ -14439,14 +14656,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
- // Can't shuffle using an illegal type
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -14455,13 +14674,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
+  // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
@@ -14470,7 +14694,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i));
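The store-side shuffle is the mirror image: `ShuffleVec[i] = i*SizeRatio` gathers the narrow element from each widened lane and packs the results at the bottom of the register, after which the loop issues (ToSz*NumElems)/StoreSize wide scalar stores. A sketch with assumed sizes (illustrative only):

#include <cassert>
#include <vector>

int main() {
  // Example: a v4i32 value truncate-stored as v4i8, so SizeRatio = 4.
  const unsigned NumElems = 4, SizeRatio = 4;
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;  // gather the bottom of each wide lane
  // With ToSz = 8 bits and StoreType = i32, a single store covers all
  // four packed bytes.
  const unsigned ToSz = 8, StoreSz = 32;
  assert((ToSz * NumElems) / StoreSz == 1);
  return 0;
}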
@@ -14819,18 +15043,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX())
+ if (!Subtarget->hasAVX())
return SDValue();
- // Optimize vectors in AVX mode
- // Sign extend v8i16 to v8i32 and
- // v4i32 to v4i64
- //
- // Divide input vector into two parts
- // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
- // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
- // concat the vectors to original VT
-
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
@@ -14839,23 +15054,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
(VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+
+ // Optimize vectors in AVX mode
+ // Sign extend v8i16 to v8i32 and
+ // v4i32 to v4i64
+ //
+ // Divide input vector into two parts
+ // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+ // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+ // concat the vectors to original VT
+
unsigned NumElems = OpVT.getVectorNumElements();
SmallVector<int,8> ShufMask1(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask1.data());
+ &ShufMask1[0]);
SmallVector<int,8> ShufMask2(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask2.data());
+ &ShufMask2[0]);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
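For the pre-AVX2 path, the two masks {0,1,-1,-1} and {2,3,-1,-1} select the low and high halves of the source, each half is widened with vpmovsx, and the results are concatenated. An element-level model for v4i32 -> v4i64 (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int32_t, 4> Src = {-1, 2, -3, 4};
  // ShufMask1 = {0,1,-1,-1} and ShufMask2 = {2,3,-1,-1} pick the halves.
  std::array<int32_t, 2> Lo = {Src[0], Src[1]}, Hi = {Src[2], Src[3]};
  // vpmovsxdq widens each half; CONCAT_VECTORS rebuilds the v4i64.
  std::array<int64_t, 4> Out = {Lo[0], Lo[1], Hi[0], Hi[1]};
  assert(Out[0] == -1 && Out[1] == 2 && Out[2] == -3 && Out[3] == 4);
  return 0;
}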
@@ -14864,6 +15093,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
}
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
@@ -14888,6 +15118,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
@@ -14900,26 +15131,57 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
- if (Subtarget->hasAVX()) {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
- SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
- SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
- SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements()/2);
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+ SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
+ SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
- OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
- }
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+ return SDValue();
+}
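Zero extension falls out of interleaving with a zero vector: unpcklwd(x, 0) places a zero word above each low source word, unpckhwd does the same for the high words, and reinterpreting each result as a vector of dwords yields the zero-extended elements. A word-level model, assuming little-endian lane layout (illustrative only):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint16_t, 8> X = {1, 2, 3, 4, 5, 6, 7, 8}, Z = {};
  std::array<uint16_t, 8> Lo, Hi;
  // punpcklwd / punpckhwd interleave the source with the zero vector.
  for (int i = 0; i != 4; ++i) {
    Lo[2 * i] = X[i];     Lo[2 * i + 1] = Z[i];
    Hi[2 * i] = X[4 + i]; Hi[2 * i + 1] = Z[4 + i];
  }
  // Viewing each interleaved half as v4i32 gives the zero-extended values.
  uint32_t Lo32[4], Hi32[4];
  std::memcpy(Lo32, Lo.data(), sizeof(Lo32));
  std::memcpy(Hi32, Hi.data(), sizeof(Hi32));
  assert(Lo32[0] == 1 && Lo32[3] == 4 && Hi32[0] == 5 && Hi32[3] == 8);
  return 0;
}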
+// Optimize x == -y --> x+y == 0
+// x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ LHS.getValueType(), RHS, LHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ RHS.getValueType(), LHS, RHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
return SDValue();
}
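The rewrite relies on the identity x == -y ⟺ x + y == 0, which holds in two's complement even when the negation wraps; it saves materializing the negation when the sub has no other users. A scalar check using unsigned arithmetic to avoid overflow UB (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {5, -5, 0, INT32_MIN})
    for (int32_t y : {5, -5, 0, INT32_MIN}) {
      bool eqNeg = x == (int32_t)(0u - (uint32_t)y);            // x == -y
      bool sumZero = (int32_t)((uint32_t)x + (uint32_t)y) == 0; // x + y == 0
      assert(eqNeg == sumZero);
    }
  return 0;
}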
@@ -14941,9 +15203,36 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+ // Notice that we use SINT_TO_FP because we know that the high bits
+ // are zero and SINT_TO_FP is better supported by the hardware.
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
+ return SDValue();
+}
+
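The substitution is sound because the zero-extended value always lands in the non-negative range of the wider signed type, so signed and unsigned conversion agree. A scalar check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v != 256; ++v) {
    uint8_t x = (uint8_t)v;
    float viaUnsigned = (float)x;         // UINT_TO_FP semantics
    float viaSigned = (float)(int32_t)x;  // SINT_TO_FP(ZEXT(x)) semantics
    assert(viaUnsigned == viaSigned);     // zero high bits keep them equal
  }
  return 0;
}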
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86TargetLowering *XTLI) {
SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (Op0.getOpcode() == ISD::LOAD) {
@@ -14962,6 +15251,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+  // v4i8 = FP_TO_SINT(x) -> v4i8 = TRUNCATE(v4i32 = FP_TO_SINT(x))
+ if (VT == MVT::v8i8 || VT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
+ }
+
+ return SDValue();
+}
+
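Widening the conversion and truncating afterwards agrees with the narrow conversion whenever the input is in range for the narrow type (out-of-range inputs are undefined for FP_TO_SINT either way). A scalar check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v != 128; ++v) {
    float f = (float)v;
    int8_t direct = (int8_t)f;            // narrow FP_TO_SINT
    int8_t widened = (int8_t)(int32_t)f;  // FP_TO_SINT to i32, then truncate
    assert(direct == widened);
  }
  return 0;
}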
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15096,9 +15399,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -15106,9 +15411,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
@@ -15653,55 +15960,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// in the normal allocation?
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget->is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32RegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
- else if (VT == MVT::i64 || VT == MVT::f64)
- return std::make_pair(0U, X86::GR64RegisterClass);
- break;
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i64 || VT == MVT::f64)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
}
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
- else if (VT == MVT::i64)
- return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
+ return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
+ return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32RegisterClass);
- return std::make_pair(0U, X86::GR64RegisterClass);
+ return std::make_pair(0U, &X86::GR32RegClass);
+ return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
- return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP32RegisterClass);
+ return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP64RegisterClass);
- return std::make_pair(0U, X86::RFP80RegisterClass);
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget->hasMMX()) break;
- return std::make_pair(0U, X86::VR64RegisterClass);
+ return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget->hasSSE2()) break;
// FALL THROUGH.
@@ -15713,10 +16020,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- return std::make_pair(0U, X86::FR32RegisterClass);
+ return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
- return std::make_pair(0U, X86::FR64RegisterClass);
+ return std::make_pair(0U, &X86::FR64RegClass);
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
@@ -15724,7 +16031,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
- return std::make_pair(0U, X86::VR128RegisterClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
@@ -15732,8 +16039,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
- return std::make_pair(0U, X86::VR256RegisterClass);
-
+ return std::make_pair(0U, &X86::VR256RegClass);
}
break;
}
@@ -15756,28 +16062,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Constraint[6] == '}') {
Res.first = X86::ST0+Constraint[4]-'0';
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint)) {
Res.first = X86::ST0;
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint)) {
Res.first = X86::EFLAGS;
- Res.second = X86::CCRRegisterClass;
+ Res.second = &X86::CCRRegClass;
return Res;
}
// 'A' means EAX + EDX.
if (Constraint == "A") {
Res.first = X86::EAX;
- Res.second = X86::GR32_ADRegisterClass;
+ Res.second = &X86::GR32_ADRegClass;
return Res;
}
return Res;
@@ -15793,7 +16099,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
// really want an 8-bit or 32-bit register, map to the appropriate register
// class and return the appropriate register.
- if (Res.second == X86::GR16RegisterClass) {
+ if (Res.second == &X86::GR16RegClass) {
if (VT == MVT::i8) {
unsigned DestReg = 0;
switch (Res.first) {
@@ -15805,7 +16111,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR8RegisterClass;
+ Res.second = &X86::GR8RegClass;
}
} else if (VT == MVT::i32) {
unsigned DestReg = 0;
@@ -15822,7 +16128,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR32RegisterClass;
+ Res.second = &X86::GR32RegClass;
}
} else if (VT == MVT::i64) {
unsigned DestReg = 0;
@@ -15839,22 +16145,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR64RegisterClass;
+ Res.second = &X86::GR64RegClass;
}
}
- } else if (Res.second == X86::FR32RegisterClass ||
- Res.second == X86::FR64RegisterClass ||
- Res.second == X86::VR128RegisterClass) {
+ } else if (Res.second == &X86::FR32RegClass ||
+ Res.second == &X86::FR64RegClass ||
+ Res.second == &X86::VR128RegClass) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
- if (VT == MVT::f32)
- Res.second = X86::FR32RegisterClass;
- else if (VT == MVT::f64)
- Res.second = X86::FR64RegisterClass;
- else if (X86::VR128RegisterClass->hasType(VT))
- Res.second = X86::VR128RegisterClass;
+
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32RegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64RegClass;
+ else if (X86::VR128RegClass.hasType(VT))
+ Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
}
return Res;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 09116e8..78e4d75 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -207,6 +207,10 @@ namespace llvm {
// TLSADDR - Thread Local Storage.
TLSADDR,
+ // TLSBASEADDR - Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
// TLSCALL - Thread Local Storage. When calling to an OS provided
// thunk at the address from an earlier relocation.
TLSCALL,
@@ -242,9 +246,6 @@ namespace llvm {
// PCMP* - Vector integer comparisons.
PCMPEQ, PCMPGT,
- // VPCOM, VPCOMU - XOP Vector integer comparisons.
- VPCOM, VPCOMU,
-
// ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
@@ -315,6 +316,15 @@ namespace llvm {
SFENCE,
LFENCE,
+ // FNSTSW16r - Store FP status word into i16 register.
+ FNSTSW16r,
+
+ // SAHF - Store contents of %ah into %eflags.
+ SAHF,
+
+ // RDRAND - Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
// ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
// ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
// Atomic 64-bit binary operations.
@@ -558,6 +568,18 @@ namespace llvm {
/// by AM is legal for this target, for a load/store of the specified type.
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+    /// isLegalICmpImmediate - Return true if the specified immediate is a
+    /// legal icmp immediate, that is, the target has icmp instructions that
+    /// can compare a register against the immediate without having to
+    /// materialize the immediate into a register.
+ virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is a
+    /// legal add immediate, that is, the target has add instructions that
+    /// can add a register and the immediate without having to materialize
+    /// the immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t Imm) const;
+
/// isTruncateFree - Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
@@ -761,6 +783,7 @@ namespace llvm {
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
@@ -797,12 +820,7 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
- LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -822,9 +840,9 @@ namespace llvm {
virtual bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const;
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const;
void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned NewOp) const;
@@ -909,6 +927,9 @@ namespace llvm {
/// equivalent, for use with the given x86 condition code.
SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
SelectionDAG &DAG) const;
+
+ /// Convert a comparison if required by the subtarget.
+ SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
};
namespace X86 {
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 0eee083..b6ba68f 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1143,7 +1143,9 @@ let Uses = [EFLAGS] in {
0, 0>;
}
+let isCompare = 1 in {
defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
//===----------------------------------------------------------------------===//
@@ -1154,7 +1156,7 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
(X86cmp (and_su node:$lhs, node:$rhs), 0)>;
-let Defs = [EFLAGS] in {
+let isCompare = 1, Defs = [EFLAGS] in {
let isCommutable = 1 in {
def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index fa1d676..aaef4a4 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -55,11 +55,11 @@ struct X86AddressMode {
: BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
Base.Reg = 0;
}
-
-
+
+
void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
-
+
if (BaseType == X86AddressMode::RegBase)
MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
false, false, false, 0, false));
@@ -67,16 +67,16 @@ struct X86AddressMode {
assert(BaseType == X86AddressMode::FrameIndexBase);
MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
}
-
+
MO.push_back(MachineOperand::CreateImm(Scale));
MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
false, false, false, 0, false));
-
+
if (GV)
MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
else
MO.push_back(MachineOperand::CreateImm(Disp));
-
+
MO.push_back(MachineOperand::CreateReg(0, false, false,
false, false, false, 0, false));
}
@@ -122,7 +122,7 @@ static inline const MachineInstrBuilder &
addFullAddress(const MachineInstrBuilder &MIB,
const X86AddressMode &AM) {
assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
-
+
if (AM.BaseType == X86AddressMode::RegBase)
MIB.addReg(AM.Base.Reg);
else {
@@ -135,7 +135,7 @@ addFullAddress(const MachineInstrBuilder &MIB,
MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
else
MIB.addImm(AM.Disp);
-
+
return MIB.addReg(0);
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 6f9e849..99c2b8f 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -375,11 +375,16 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in
+ Uses = [ESP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
Requires<[In32BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addr32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In32BitMode]>;
+}
// All calls clobber the non-callee saved registers. RSP is marked as
// a use to prevent stack-pointer assignments that appear immediately
@@ -389,11 +394,16 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [RSP] in
+ Uses = [RSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+ "# TLS_base_addr64",
+ [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+}
// Darwin TLS Support
// For i386, the address of the thunk is passed on the stack, on return the
@@ -1008,8 +1018,8 @@ def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
// tailcall stuff
-def : Pat<(X86tcret GR32_TC:$dst, imm:$off),
- (TCRETURNri GR32_TC:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In32BitMode]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
@@ -1623,6 +1633,12 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
(SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index bf11fde..b0c27c8 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -18,16 +18,16 @@
// Return instructions.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP in {
- def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RET : I <0xC3, RawFrm, (outs), (ins),
"ret",
[(X86retflag 0)], IIC_RET>;
- def RETW : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize;
- def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret\t$amt",
[(X86retflag timm:$amt)], IIC_RET_IMM>;
- def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt",
[], IIC_RET_IMM>, OpSize;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
@@ -148,12 +148,12 @@ let isCall = 1 in
// registers are added manually.
let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i32imm_pcrel:$dst,variable_ops),
+ (outs), (ins i32imm_pcrel:$dst),
"call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>;
- def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
Requires<[In32BitMode]>;
- def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>,
Requires<[In32BitMode]>;
@@ -174,7 +174,7 @@ let isCall = 1 in
// callw for 16 bit code for the assembler.
let isAsmParserOnly = 1 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
- (outs), (ins i16imm_pcrel:$dst, variable_ops),
+ (outs), (ins i16imm_pcrel:$dst),
"callw\t$dst", []>, OpSize;
}
@@ -185,23 +185,23 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1 in
let Uses = [ESP] in {
def TCRETURNdi : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
def TCRETURNri : PseudoI<(outs),
- (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs),
- (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i32mem_TC:$dst, i32imm:$offset), []>;
// FIXME: These should be pseudo instructions that are lowered when going to
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, variable_ops),
+ (ins i32imm_pcrel:$dst),
"jmp\t$dst # TAILCALL",
[], IIC_JMP_REL>;
- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
- def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops),
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
"jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
@@ -218,14 +218,14 @@ let isCall = 1, Uses = [RSP] in {
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ (outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>,
Requires<[In64BitMode]>;
- def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)],
IIC_CALL_RI>,
Requires<[In64BitMode]>;
- def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
IIC_CALL_MEM>,
Requires<[In64BitMode]>;
@@ -240,7 +240,7 @@ let isCall = 1, isCodeGenOnly = 1 in
let Defs = [RAX, R10, R11, RSP, EFLAGS],
Uses = [RSP] in {
def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ (outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>,
Requires<[IsWin64]>;
}
@@ -250,21 +250,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
let Uses = [RSP],
usesCustomInserter = 1 in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>;
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, variable_ops),
+ (ins i64i32imm_pcrel:$dst),
"jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
- def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops),
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
let mayLoad = 1 in
- def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops),
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
"jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index d57937b..8802a2e 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -15,83 +15,161 @@
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
+let Constraints = "$src1 = $dst" in {
multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
+let neverHasSideEffects = 1 in {
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
+ let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
+} // neverHasSideEffects = 1
+}
+
+// Intrinsic patterns for the 132 form
+multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ Intrinsic Int128, Intrinsic Int256> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src3, VR128:$src2))]>;
+ def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int128 VR128:$src1, (MemFrag128 addr:$src3), VR128:$src2))]>;
+ def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src3, VR256:$src2))]>;
+ def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (Int256 VR256:$src1, (MemFrag256 addr:$src3), VR256:$src2))]>;
}
+} // Constraints = "$src1 = $dst"
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr, string PackTy> {
- defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
- defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
- defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+ string OpcodeStr, string PackTy,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ Intrinsic Int128, Intrinsic Int256> {
+ defm r132 : fma3p_rm_int <opc132, !strconcat(OpcodeStr,
+ !strconcat("132", PackTy)), MemFrag128, MemFrag256,
+ Int128, Int256>;
+ defm r132 : fma3p_rm <opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
+ defm r213 : fma3p_rm <opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
+ defm r231 : fma3p_rm <opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
}
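For reference, the 132/213/231 suffixes encode which sources feed the multiply and the add; the destination is always the tied operand ($src1 = $dst above). Spelled out as plain scalar C++ (per Intel's definitions; the functions are illustrative only):

    float fma132(float dst, float src2, float src3) { return dst  * src3 + src2; }
    float fma213(float dst, float src2, float src3) { return src2 * dst  + src3; }
    float fma231(float dst, float src2, float src3) { return src2 * src3 + dst;  }

This is also why the fma3p_rm_int patterns list operands as (src1, src3, src2): with a generic fma(a, b, c) = a*b + c intrinsic, the 132 form binds a to the tied register, b to src3, and c to src2.
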
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">;
- defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">;
- defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">;
- defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">;
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfmadd_ps, int_x86_fma_vfmadd_ps_256>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfmsub_ps, int_x86_fma_vfmsub_ps_256>;
+ defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+ memopv4f32, memopv8f32, int_x86_fma_vfmaddsub_ps,
+ int_x86_fma_vfmaddsub_ps_256>;
+ defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+ memopv4f32, memopv8f32, int_x86_fma_vfmsubadd_ps,
+                                 int_x86_fma_vfmsubadd_ps_256>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W;
- defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W;
- defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W;
- defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W;
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmadd_pd, int_x86_fma_vfmadd_pd_256>, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmsub_pd, int_x86_fma_vfmsub_pd_256>, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmaddsub_pd, int_x86_fma_vfmaddsub_pd_256>, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfmsubadd_pd, int_x86_fma_vfmsubadd_pd_256>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">;
- defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">;
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfnmadd_ps, int_x86_fma_vfnmadd_ps_256>;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfnmsub_ps, int_x86_fma_vfnmsub_ps_256>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W;
- defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W;
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfnmadd_pd, int_x86_fma_vfnmadd_pd_256>, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", memopv2f64,
+ memopv4f64, int_x86_fma_vfnmsub_pd, int_x86_fma_vfnmsub_pd_256>, VEX_W;
}
-multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> {
- def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
+ RegisterClass RC> {
+let neverHasSideEffects = 1 in {
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
- def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
+} // neverHasSideEffects = 1
}
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic IntId> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src3, VR128:$src2))]>;
+ def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1, mem_cpat:$src3, VR128:$src2))]>;
+}
+} // Constraints = "$src1 = $dst"
+
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr> {
- defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>;
- defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>;
- defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>;
- defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W;
- defm SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W;
- defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W;
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64> {
+ defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>;
+ defm SSr213 : fma3s_rm<opc213, !strconcat(OpStr, "213ss"), f32mem, FR32>;
+ defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>;
+ defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>, VEX_W;
+ defm SDr213 : fma3s_rm<opc213, !strconcat(OpStr, "213sd"), f64mem, FR64>, VEX_W;
+ defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>, VEX_W;
+ defm SSr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132ss"), ssmem,
+ sse_load_f32, IntF32>;
+ defm SDr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132sd"), sdmem,
+ sse_load_f64, IntF64>;
}
-defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG;
-defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG;
+defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd>, VEX_LIG;
+defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd>, VEX_LIG;
+
+defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd>, VEX_LIG;
+defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd>, VEX_LIG;
-defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG;
-defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -178,43 +256,47 @@ let isCodeGenOnly = 1 in {
} // isCodeGenOnly = 1
}
+let Predicates = [HasFMA4] in {
+
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma4_vfmadd_ss>;
+ int_x86_fma_vfmadd_ss>;
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma4_vfmadd_sd>;
-defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps,
- int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd,
- int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfmadd_sd>;
+defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
+ int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
+ int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma4_vfmsub_ss>;
+ int_x86_fma_vfmsub_ss>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma4_vfmsub_sd>;
-defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps,
- int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd,
- int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfmsub_sd>;
+defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
+ int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
+ int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma4_vfnmadd_ss>;
+ int_x86_fma_vfnmadd_ss>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma4_vfnmadd_sd>;
-defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma4_vfnmadd_ps,
- int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd,
- int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfnmadd_sd>;
+defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
+ int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
+defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
+ int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma4_vfnmsub_ss>;
+ int_x86_fma_vfnmsub_ss>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma4_vfnmsub_sd>;
-defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma4_vfnmsub_ps,
- int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd,
- int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps,
- int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd,
- int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps,
- int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd,
- int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+ int_x86_fma_vfnmsub_sd>;
+defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
+ int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
+defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
+ int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
+ int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
+ int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
+ int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
+ int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+} // HasFMA4
+
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index a13887e..568726e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -27,6 +27,7 @@ def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
SDTCisVT<2, OtherVT>]>;
def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -41,6 +42,7 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
SDNPMemOperand]>;
+def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
@@ -203,6 +205,7 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
}
}
+let Defs = [FPSW] in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
@@ -213,6 +216,7 @@ defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
defm MUL : FPBinary<fmul, MRM1m, "mul">;
defm DIV : FPBinary<fdiv, MRM6m, "div">;
defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+}
class FPST0rInst<bits<8> o, string asm>
: FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
@@ -257,6 +261,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
}
+let Defs = [FPSW] in {
defm CHS : FPUnary<fneg, 0xE0, "fchs">;
defm ABS : FPUnary<fabs, 0xE1, "fabs">;
defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
@@ -269,6 +274,7 @@ def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
}
def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
@@ -316,6 +322,7 @@ multiclass FPCMov<PatLeaf cc> {
Requires<[HasCMov]>;
}
+let Defs = [FPSW] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
defm CMOVBE : FPCMov<X86_COND_BE>;
@@ -416,24 +423,40 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
}
let mayLoad = 1 in {
-def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
-def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
-def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
-def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
-def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
-def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+ IIC_FLD>;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+ IIC_FLD>;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+ IIC_FLD80>;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+ IIC_FILD>;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+ IIC_FILD>;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+ IIC_FILD>;
}
let mayStore = 1 in {
-def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
-def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
-def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
-def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
-def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
-def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
-def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
-def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
-def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
-def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+ IIC_FST>;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+ IIC_FST>;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+ IIC_FST>;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+ IIC_FST>;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+ IIC_FST80>;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+ IIC_FIST>;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+ IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+ IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+ IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+ IIC_FIST>;
}
// FISTTP requires SSE3 even though it's a FPStack op.
@@ -459,17 +482,23 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
} // Predicates = [HasSSE3]
let mayStore = 1 in {
-def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
-def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+ IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+ IIC_FST>;
def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
- "fisttp{ll}\t$dst">;
+ "fisttp{ll}\t$dst", IIC_FST>;
}
// FP Stack manipulation instructions.
-def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
-def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
-def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
-def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
+ IIC_FLD>, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
+ IIC_FST>, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
+ IIC_FST>, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
+ IIC_FXCH>, D9;
// Floating point constant loads.
let isReMaterializable = 1 in {
@@ -487,20 +516,21 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
-def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
// Floating point compares.
-let Defs = [EFLAGS] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
+ [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
+ [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- []>; // FPSW = cmp ST(0) with ST(i)
-
+ [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // Defs = [FPSW]
+
// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
@@ -509,85 +539,94 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
[(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
}
-let Defs = [EFLAGS], Uses = [ST0] in {
+let Defs = [FPSW], Uses = [ST0] in {
def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
(outs), (ins RST:$reg),
- "fucom\t$reg">, DD;
+ "fucom\t$reg", IIC_FUCOM>, DD;
def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
(outs), (ins RST:$reg),
- "fucomp\t$reg">, DD;
+ "fucomp\t$reg", IIC_FUCOM>, DD;
def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
(outs), (ins),
- "fucompp">, DA;
+ "fucompp", IIC_FUCOM>, DA;
+}
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
(outs), (ins RST:$reg),
- "fucomi\t$reg">, DB;
+ "fucomi\t$reg", IIC_FUCOMI>, DB;
def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
(outs), (ins RST:$reg),
- "fucompi\t$reg">, DF;
+ "fucompi\t$reg", IIC_FUCOMI>, DF;
}
+let Defs = [EFLAGS, FPSW] in {
def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcomi\t$reg">, DB;
+ "fcomi\t$reg", IIC_FCOMI>, DB;
def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcompi\t$reg">, DF;
+ "fcompi\t$reg", IIC_FCOMI>, DF;
+}
// Floating point flag ops.
-let Defs = [AX] in
-def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
- (outs), (ins), "fnstsw %ax", []>, DF;
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw %ax",
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
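The new X86fp_stsw node gives the status-word read an explicit dataflow edge (FPSW in, AX out) rather than an implicit side effect. It models the classic fnstsw idiom; a user-level inline-asm sketch of that idiom (GCC/Clang syntax, illustrative only):

    #include <cstdint>

    // Read the x87 status word. The "=a" constraint forces AX, matching
    // the FNSTSW16r encoding above: fnstsw's register form only writes AX.
    uint16_t readFPStatusWord() {
      uint16_t sw;
      asm volatile("fnstsw %0" : "=a"(sw));
      return sw;
    }
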
def FNSTCW16m  : I<0xD9, MRM7m,                 // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
- [(X86fp_cwd_get16 addr:$dst)]>;
+ [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
let mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m,                   // X87 control word = [mem16]
- (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>;
// FPU control instructions
-def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB;
+let Defs = [FPSW] in
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
- "ffree\t$reg">, DD;
+ "ffree\t$reg", IIC_FFREE>, DD;
// Clear exceptions
-def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB;
+let Defs = [FPSW] in
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
// Operandless floating-point instructions for the disassembler.
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
-
-def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9;
-def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9;
-def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9;
-def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9;
-def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9;
-def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9;
-def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9;
-def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9;
-def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9;
-def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9;
-def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9;
-def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9;
-def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9;
-def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9;
-def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9;
-def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9;
-def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9;
-def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9;
-def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9;
-def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9;
-def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE;
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
- "fxsave\t$dst", []>, TB;
+ "fxsave\t$dst", [], IIC_FXSAVE>, TB;
def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
- "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
+ Requires<[In64BitMode]>;
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", []>, TB;
+ "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>;
+ "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+ Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index b387090..a115ab4 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -255,8 +255,9 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
-class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
- : I<o, F, outs, ins, asm, []> {}
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, [], itin> {}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
@@ -365,6 +366,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
//
// SDI - SSE2 instructions with XD prefix.
// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// SSDI - SSE2 instructions with XS prefix.
// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
// PDI - SSE2 instructions with TB and OpSize prefixes.
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
@@ -377,8 +379,11 @@ class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class SSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
-  : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+  : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
@@ -503,29 +508,29 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE2, HasAES]>;
+ Requires<[HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE2, HasAES]>;
+ Requires<[HasAES]>;
-// CLMUL Instruction Templates
-class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, Requires<[HasSSE2, HasCLMUL]>;
+ OpSize, Requires<[HasPCLMUL]>;
-class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>;
+ OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin>, T8,
- OpSize, VEX_4V, Requires<[HasFMA3]>;
+ OpSize, VEX_4V, Requires<[HasFMA]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 35801e4..ec030dd 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisOpSmallerThanOp<1, 0> ]>>;
+
def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
-
+
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
@@ -102,13 +107,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
-def X86vpcom : SDNode<"X86ISD::VPCOM",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
-def X86vpcomu : SDNode<"X86ISD::VPCOMU",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
-
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
@@ -304,7 +302,7 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
}]>;
def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
+ (st node:$val, node:$ptr), [{
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
return ST->isNonTemporal() && !ST->isTruncatingStore() &&
ST->getAddressingMode() == ISD::UNINDEXED &&
@@ -313,7 +311,7 @@ def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
}]>;
def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
+ (st node:$val, node:$ptr), [{
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
return ST->isNonTemporal() &&
ST->getAlignment() < 16;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index b12c1db..69493bc 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -54,38 +55,39 @@ ReMatPICStubLoad("remat-pic-stub-load",
enum {
// Select which memory operand is being unfolded.
- // (stored in bits 0 - 7)
+ // (stored in bits 0 - 3)
TB_INDEX_0 = 0,
TB_INDEX_1 = 1,
TB_INDEX_2 = 2,
- TB_INDEX_MASK = 0xff,
-
- // Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
- TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT,
+ TB_INDEX_3 = 3,
+ TB_INDEX_MASK = 0xf,
// Do not insert the reverse map (MemOp -> RegOp) into the table.
// This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 16,
+ TB_NO_REVERSE = 1 << 4,
// Do not insert the forward map (RegOp -> MemOp) into the table.
// This is needed for Native Client, which prohibits branch
// instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 17,
+ TB_NO_FORWARD = 1 << 5,
+
+ TB_FOLDED_LOAD = 1 << 6,
+ TB_FOLDED_STORE = 1 << 7,
- TB_FOLDED_LOAD = 1 << 18,
- TB_FOLDED_STORE = 1 << 19
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
};
struct X86OpTblEntry {
uint16_t RegOp;
uint16_t MemOp;
- uint32_t Flags;
+ uint16_t Flags;
};
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
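With every field packed into 16 bits (the unfold index in bits 0-3, the NO_REVERSE/NO_FORWARD/FOLDED bits in 4-7, and the alignment in 8-15), the Flags member can shrink from uint32_t to uint16_t. A minimal decode sketch of the layout above (the helper names are hypothetical):

    #include <cstdint>

    unsigned foldIndex(uint16_t Flags)     { return Flags & 0xf; }         // TB_INDEX_MASK
    bool     foldsLoad(uint16_t Flags)     { return Flags & (1 << 6); }    // TB_FOLDED_LOAD
    bool     foldsStore(uint16_t Flags)    { return Flags & (1 << 7); }    // TB_FOLDED_STORE
    unsigned minAlignBytes(uint16_t Flags) { return (Flags >> 8) & 0xff; } // TB_ALIGN_*
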
@@ -408,14 +410,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
{ X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
{ X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
- { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 },
- { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 },
- { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
{ X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
{ X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
@@ -492,14 +490,20 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// AVX 128-bit versions of foldable instructions
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
- { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 },
- { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 },
- { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 },
- { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 },
- { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 },
{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
{ X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE },
{ X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
@@ -535,6 +539,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 },
{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
+
// AVX 256-bit foldable instructions
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
@@ -543,6 +549,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
{ X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 },
{ X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 },
+
// AVX2 foldable instructions
{ X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 },
{ X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 },
@@ -558,6 +565,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 },
{ X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 },
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -808,17 +817,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
- { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 },
- { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 },
- { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 },
- { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
@@ -1122,6 +1121,158 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// Index 2, folded load
Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
+
+ static const X86OpTblEntry OpTbl3[] = {
+ // FMA foldable instructions
+ { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 },
+ { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 },
+ { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 },
+ { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 },
+ { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 },
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, 0 },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, 0 },
+
+ { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFMADDPSr132r_Int, X86::VFMADDPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPDr132r_Int, X86::VFMADDPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPSr132rY_Int, X86::VFMADDPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDPDr132rY_Int, X86::VFMADDPDr132mY_Int, TB_ALIGN_32 },
+
+ { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
+ { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 },
+ { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 },
+ { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 },
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, 0 },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, 0 },
+
+ { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 },
+ { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFNMADDPSr132r_Int, X86::VFNMADDPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPDr132r_Int, X86::VFNMADDPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPSr132rY_Int, X86::VFNMADDPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFNMADDPDr132rY_Int, X86::VFNMADDPDr132mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
+ { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 },
+ { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 },
+ { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 },
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, 0 },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, 0 },
+
+ { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBPSr132r_Int, X86::VFMSUBPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPDr132r_Int, X86::VFMSUBPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPSr132rY_Int, X86::VFMSUBPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBPDr132rY_Int, X86::VFMSUBPDr132mY_Int, TB_ALIGN_32 },
+
+ { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
+ { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 },
+ { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 },
+ { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 },
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, 0 },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, 0 },
+
+ { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr132r_Int, X86::VFNMSUBPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr132r_Int, X86::VFNMSUBPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr132rY_Int, X86::VFNMSUBPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr132rY_Int, X86::VFNMSUBPDr132mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr132r_Int, X86::VFMADDSUBPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr132r_Int, X86::VFMADDSUBPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr132rY_Int, X86::VFMADDSUBPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr132rY_Int, X86::VFMADDSUBPDr132mY_Int, TB_ALIGN_32 },
+
+ { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr132r_Int, X86::VFMSUBADDPSr132m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr132r_Int, X86::VFMSUBADDPDr132m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr132rY_Int, X86::VFMSUBADDPSr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr132rY_Int, X86::VFMSUBADDPDr132mY_Int, TB_ALIGN_32 },
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+ unsigned RegOp = OpTbl3[i].RegOp;
+ unsigned MemOp = OpTbl3[i].MemOp;
+ unsigned Flags = OpTbl3[i].Flags;
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 3, folded load
+ Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
+
}
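The registration loop mirrors the OpTbl0-2 loops earlier in the constructor: each entry keys the register-form opcode to its memory form, and TB_INDEX_3 | TB_FOLDED_LOAD records that operand 3 is the one a load may be folded into. A simplified model of the forward map AddTableEntry populates (the real code uses DenseMaps and also conditionally fills the reverse MemOp -> RegOp map):

    #include <cstdint>
    #include <unordered_map>
    #include <utility>

    using FoldTable = std::unordered_map<uint16_t, std::pair<uint16_t, uint16_t>>;

    // Forward direction: register-form opcode -> (memory-form opcode,
    // packed flags: index, fold kind, alignment).
    void addTableEntry(FoldTable &RegOp2MemOpTable3,
                       uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
      RegOp2MemOpTable3.emplace(RegOp, std::make_pair(MemOp, Flags));
    }
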
void
@@ -1782,12 +1933,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src,
- MIOpc == X86::INC64r ? X86::GR64_NOSPRegisterClass :
- X86::GR32_NOSPRegisterClass))
+ !MF.getRegInfo().constrainRegClass(Src, RC))
return 0;
NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
@@ -1812,11 +1964,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+ (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src,
- MIOpc == X86::DEC64r ? X86::GR64_NOSPRegisterClass :
- X86::GR32_NOSPRegisterClass))
+ !MF.getRegInfo().constrainRegClass(Src, RC))
return 0;
NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
@@ -1844,10 +1997,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
const TargetRegisterClass *RC;
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
Opc = X86::LEA64r;
- RC = X86::GR64_NOSPRegisterClass;
+ RC = &X86::GR64_NOSPRegClass;
} else {
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- RC = X86::GR32_NOSPRegisterClass;
+ RC = &X86::GR32_NOSPRegClass;
}
@@ -1863,6 +2016,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.addReg(Dest, RegState::Define |
getDeadRegState(isDead)),
Src, isKill, Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
if (LV && isKill2)
LV->replaceKillInstruction(Src2, MI, NewMI);
break;
@@ -2079,7 +2239,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
}
}
-static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {
default: return X86::COND_INVALID;
case X86::JE_4: return X86::COND_E;
@@ -2101,6 +2261,84 @@ static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
}
}
+/// getCondFromSETOpc - return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::SETAr: case X86::SETAm: return X86::COND_A;
+ case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+ case X86::SETBr: case X86::SETBm: return X86::COND_B;
+ case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+ case X86::SETEr: case X86::SETEm: return X86::COND_E;
+ case X86::SETGr: case X86::SETGm: return X86::COND_G;
+ case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+ case X86::SETLr: case X86::SETLm: return X86::COND_L;
+ case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+ case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+ case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+ case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+ case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+ case X86::SETOr: case X86::SETOm: return X86::COND_O;
+ case X86::SETPr: case X86::SETPm: return X86::COND_P;
+ case X86::SETSr: case X86::SETSm: return X86::COND_S;
+ }
+}
+
+/// getCondFromCMovOpc - return the condition code of a CMov opcode.
+static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+ switch (Opc) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
+ case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
+ return X86::COND_A;
+ case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+ return X86::COND_AE;
+ case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
+ case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
+ return X86::COND_B;
+ case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+ return X86::COND_BE;
+ case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
+ case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
+ return X86::COND_E;
+ case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
+ case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
+ return X86::COND_G;
+ case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+ case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+ return X86::COND_GE;
+ case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
+ case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
+ return X86::COND_L;
+ case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+ case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+ return X86::COND_LE;
+ case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+ return X86::COND_NE;
+ case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+ case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+ return X86::COND_NO;
+ case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+ return X86::COND_NP;
+ case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+ case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+ return X86::COND_NS;
+ case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
+ case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
+ return X86::COND_O;
+ case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
+ case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
+ return X86::COND_P;
+ case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
+ case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
+ return X86::COND_S;
+ }
+}
+
unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
switch (CC) {
default: llvm_unreachable("Illegal condition code!");
@@ -2147,6 +2385,101 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
}
}
+/// getSwappedCondition - assuming the flags are set by MI(a,b), return
+/// the condition code to use if we rewrite the instruction so that the
+/// flags are set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: return X86::COND_INVALID;
+ case X86::COND_E: return X86::COND_E;
+ case X86::COND_NE: return X86::COND_NE;
+ case X86::COND_L: return X86::COND_G;
+ case X86::COND_LE: return X86::COND_GE;
+ case X86::COND_G: return X86::COND_L;
+ case X86::COND_GE: return X86::COND_LE;
+ case X86::COND_B: return X86::COND_A;
+ case X86::COND_BE: return X86::COND_AE;
+ case X86::COND_A: return X86::COND_B;
+ case X86::COND_AE: return X86::COND_BE;
+ }
+}
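
(Editorial aside: a worked example of the swap. "cmp a, b; jl" tests a < b;
if the flags instead come from "cmp b, a", the same test must be spelled
"jg". Below is a minimal, hypothetical sketch of the mapping above; the
CondCode enum is a stand-in, not the real X86::CondCode from X86BaseInfo.h.)

    // swapped_cond_sketch.cpp (hypothetical file): illustration only.
    #include <cassert>

    enum CondCode { COND_E, COND_NE, COND_L, COND_LE, COND_G, COND_GE,
                    COND_B, COND_BE, COND_A, COND_AE, COND_INVALID };

    // Same mapping as getSwappedCondition: the flags now come from MI(b,a).
    static CondCode swappedCond(CondCode CC) {
      switch (CC) {
      default:      return COND_INVALID;
      case COND_E:  return COND_E;   // a == b  is the same as  b == a
      case COND_NE: return COND_NE;
      case COND_L:  return COND_G;   // a < b   is the same as  b > a  (signed)
      case COND_LE: return COND_GE;
      case COND_G:  return COND_L;
      case COND_GE: return COND_LE;
      case COND_B:  return COND_A;   // unsigned counterparts
      case COND_BE: return COND_AE;
      case COND_A:  return COND_B;
      case COND_AE: return COND_BE;
      }
    }

    int main() {
      assert(swappedCond(COND_L) == COND_G);
      assert(swappedCond(swappedCond(COND_BE)) == COND_BE); // involution
    }
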
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has a memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+ bool HasMemoryOperand) {
+ static const unsigned Opc[16][2] = {
+ { X86::SETAr, X86::SETAm },
+ { X86::SETAEr, X86::SETAEm },
+ { X86::SETBr, X86::SETBm },
+ { X86::SETBEr, X86::SETBEm },
+ { X86::SETEr, X86::SETEm },
+ { X86::SETGr, X86::SETGm },
+ { X86::SETGEr, X86::SETGEm },
+ { X86::SETLr, X86::SETLm },
+ { X86::SETLEr, X86::SETLEm },
+ { X86::SETNEr, X86::SETNEm },
+ { X86::SETNOr, X86::SETNOm },
+ { X86::SETNPr, X86::SETNPm },
+ { X86::SETNSr, X86::SETNSm },
+ { X86::SETOr, X86::SETOm },
+ { X86::SETPr, X86::SETPm },
+ { X86::SETSr, X86::SETSm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and whether it has a memory operand.
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
+ static const unsigned Opc[32][3] = {
+ { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
+ { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+ { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
+ { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+ { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
+ { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
+ { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+ { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
+ { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+ { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+ { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+ { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+ { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+ { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
+ { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
+ { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
+ { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
+ { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+ { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
+ { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+ { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
+ { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
+ { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+ { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
+ { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+ { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+ { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+ { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+ { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+ { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
+ { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
+ { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
+ };
+
+ assert(CC < 16 && "Can only handle standard cond codes");
+ unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+  switch (RegBytes) {
+ default: llvm_unreachable("Illegal register size!");
+ case 2: return Opc[Idx][0];
+ case 4: return Opc[Idx][1];
+ case 8: return Opc[Idx][2];
+ }
+}
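
(Editorial aside: both encoders here, getSETFromCond and getCMovFromCond,
lean on the same implicit contract: the standard condition codes are assumed
to occupy enum values 0 through 15 in exactly this row order, COND_A first
and COND_S last, which is what the "CC < 16" asserts guard. A hypothetical,
self-contained sketch of the two-level indexing, with strings standing in
for the real opcode enum values:)

    // cmov_lookup_sketch.cpp (hypothetical file): models the row/column
    // indexing of getCMovFromCond; opcode names are strings, not enums.
    #include <cassert>
    #include <string>

    static std::string cmovOpcode(unsigned CC, unsigned RegBytes, bool HasMem) {
      // Row order assumed to match X86::CondCode values 0..15.
      static const char *Cond[16] = { "A", "AE", "B", "BE", "E", "G", "GE", "L",
                                      "LE", "NE", "NO", "NP", "NS", "O", "P", "S" };
      assert(CC < 16 && "standard condition codes only");
      std::string Name = std::string("CMOV") + Cond[CC];
      switch (RegBytes) {          // column: operand width in bytes
      case 2: Name += "16"; break;
      case 4: Name += "32"; break;
      case 8: Name += "64"; break;
      default: assert(false && "illegal register size"); return "";
      }
      // The second half of the table (rows 16..31) holds the reg-mem forms.
      return Name + (HasMem ? "rm" : "rr");
    }

    int main() {
      assert(cmovOpcode(4 /*COND_E*/, 4, false) == "CMOVE32rr");
      assert(cmovOpcode(0 /*COND_A*/, 8, true)  == "CMOVA64rm");
    }
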
+
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
if (!MI->isTerminator()) return false;
@@ -2213,7 +2546,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// Handle conditional branches.
- X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
+ X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
@@ -2311,7 +2644,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
if (I->isDebugValue())
continue;
if (I->getOpcode() != X86::JMP_4 &&
- GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
@@ -2371,6 +2704,56 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return Count;
}
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ // Not all subtargets have cmov instructions.
+ if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+ return false;
+ if (Cond.size() != 1)
+ return false;
+ // We cannot do the composite conditions, at least not in SSA form.
+ if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+ return false;
+
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+  // We have cmov instructions for 16-, 32-, and 64-bit general-purpose
+  // registers.
+ if (X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR64RegClass.hasSubClassEq(RC)) {
+ // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+ // Bridge. Probably Ivy Bridge as well.
+ CondCycles = 2;
+ TrueCycles = 2;
+ FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(Cond.size() == 1 && "Invalid Cond array");
+ unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+ MRI.getRegClass(DstReg)->getSize(),
+ false/*HasMemoryOperand*/);
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
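
(Editorial aside: note the operand order in the BuildMI call above. cmov is
a two-address instruction, so the tied first source (FalseReg) is the value
that survives when the condition is false, and the second source (TrueReg)
is moved in when it holds. A scalar model of what the emitted cmov computes;
the helper name is hypothetical:)

    // cmov_select_sketch.cpp (hypothetical file).
    #include <cassert>

    static int emulateCMov(bool Cond, int TrueVal, int FalseVal) {
      int Dst = FalseVal;         // tied source: kept when Cond is false
      if (Cond) Dst = TrueVal;    // the conditional move overwrites on true
      return Dst;                 // i.e. Dst = Cond ? TrueVal : FalseVal
    }

    int main() {
      assert(emulateCMov(true, 10, 20) == 10);
      assert(emulateCMov(false, 10, 20) == 20);
    }
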
+
/// isHReg - Test if the given register is a physical h register.
static bool isHReg(unsigned Reg) {
return X86::GR8_ABCD_HRegClass.contains(Reg);
@@ -2637,6 +3020,305 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
NewMIs.push_back(MIB);
}
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(1).getImm();
+ return true;
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = MI->getOperand(1).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ SrcReg = MI->getOperand(0).getReg();
+ if (MI->getOperand(1).getReg() != SrcReg) return false;
+ // Compare against zero.
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ }
+ return false;
+}
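
(Editorial aside: for concreteness, the out-parameters for the three
recognized shapes look like this, in AT&T syntax with illustrative
registers:)

    cmpl  $42, %eax     ->  SrcReg = EAX, SrcReg2 = 0,   CmpMask = ~0, CmpValue = 42
    cmpl  %ebx, %eax    ->  SrcReg = EAX, SrcReg2 = EBX, CmpMask = ~0, CmpValue = 0
    testl %eax, %eax    ->  SrcReg = EAX, SrcReg2 = 0,   CmpMask = ~0, CmpValue = 0

A TEST of two different registers is rejected: it computes a bitwise AND of
two values rather than comparing one register against zero.
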
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+ unsigned SrcReg2, int ImmValue,
+ MachineInstr *OI) {
+ if (((FlagI->getOpcode() == X86::CMP64rr &&
+ OI->getOpcode() == X86::SUB64rr) ||
+ (FlagI->getOpcode() == X86::CMP32rr &&
+        OI->getOpcode() == X86::SUB32rr) ||
+       (FlagI->getOpcode() == X86::CMP16rr &&
+        OI->getOpcode() == X86::SUB16rr) ||
+ (FlagI->getOpcode() == X86::CMP8rr &&
+ OI->getOpcode() == X86::SUB8rr)) &&
+ ((OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getReg() == SrcReg2) ||
+ (OI->getOperand(1).getReg() == SrcReg2 &&
+ OI->getOperand(2).getReg() == SrcReg)))
+ return true;
+
+ if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+ OI->getOpcode() == X86::SUB64ri32) ||
+ (FlagI->getOpcode() == X86::CMP64ri8 &&
+ OI->getOpcode() == X86::SUB64ri8) ||
+ (FlagI->getOpcode() == X86::CMP32ri &&
+ OI->getOpcode() == X86::SUB32ri) ||
+ (FlagI->getOpcode() == X86::CMP32ri8 &&
+ OI->getOpcode() == X86::SUB32ri8) ||
+ (FlagI->getOpcode() == X86::CMP16ri &&
+ OI->getOpcode() == X86::SUB16ri) ||
+ (FlagI->getOpcode() == X86::CMP16ri8 &&
+ OI->getOpcode() == X86::SUB16ri8) ||
+ (FlagI->getOpcode() == X86::CMP8ri &&
+ OI->getOpcode() == X86::SUB8ri)) &&
+ OI->getOperand(1).getReg() == SrcReg &&
+ OI->getOperand(2).getImm() == ImmValue)
+ return true;
+ return false;
+}
+
+/// isDefConvertible - check whether the defining instruction can be converted
+/// to set EFLAGS itself, so that a comparison of its result against zero can
+/// be removed.
+inline static bool isDefConvertible(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+ case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
+ case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
+ case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+ case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
+ case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
+ case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+ case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
+ case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
+ case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
+ case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+ case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
+ case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
+ case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
+ case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
+ case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
+ case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
+ case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ return true;
+ }
+}
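
(Editorial aside: every opcode listed sets ZF and SF from its result, which
is exactly what a compare of that result against zero produces. CF and OF,
however, describe the arithmetic operation itself, which is why the
IsCmpZero path below refuses to rewrite users of the condition codes that
read them. For example, in AT&T syntax:)

    addl  %esi, %edi      # ZF/SF describe the sum, as "testl %edi, %edi" would
    testl %edi, %edi      # removable: jne reads only ZF
    jne   .LBB0_3

    addl  %esi, %edi
    testl %edi, %edi      # not removable: ja reads CF, and the add's CF
    ja    .LBB0_3         # (carry out) differs from the test's CF (always 0)
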
+
+/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+ int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI) return false;
+
+  // I points at CmpInstr; Def points at the instruction defining SrcReg.
+ MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+ // If we are comparing against zero, check whether we can use MI to update
+ // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+ bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
+ !isDefConvertible(MI)))
+ return false;
+
+  // We are searching for an earlier instruction that can make CmpInstr
+  // redundant; if one is found, it is saved in Sub.
+ MachineInstr *Sub = NULL;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  // We iterate backward, starting from the instruction before CmpInstr, and
+  // stop when we reach the definition of a source register or the start of
+  // the BB.
+ // RI points to the instruction before CmpInstr.
+ // If the definition is in this basic block, RE points to the definition;
+ // otherwise, RE is the rend of the basic block.
+ MachineBasicBlock::reverse_iterator
+ RI = MachineBasicBlock::reverse_iterator(I),
+ RE = CmpInstr->getParent() == MI->getParent() ?
+ MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+ CmpInstr->getParent()->rend();
+ MachineInstr *Movr0Inst = 0;
+ for (; RI != RE; ++RI) {
+ MachineInstr *Instr = &*RI;
+ // Check whether CmpInstr can be made redundant by the current instruction.
+ if (!IsCmpZero &&
+ isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ Sub = Instr;
+ break;
+ }
+
+ if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+ Instr->readsRegister(X86::EFLAGS, TRI)) {
+ // This instruction modifies or uses EFLAGS.
+
+      // MOV32r0 etc. are implemented with xor, which clobbers the condition
+      // codes. They are safe to move up if their definition of EFLAGS is dead
+      // and earlier instructions do not read or write EFLAGS.
+ if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
+ Instr->getOpcode() == X86::MOV16r0 ||
+ Instr->getOpcode() == X86::MOV32r0 ||
+ Instr->getOpcode() == X86::MOV64r0) &&
+ Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = Instr;
+ continue;
+ }
+
+ // We can't remove CmpInstr.
+ return false;
+ }
+ }
+
+ // Return false if no candidates exist.
+ if (!IsCmpZero && !Sub)
+ return false;
+
+ bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
+
+ // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+ // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+ // If we are done with the basic block, we need to check whether EFLAGS is
+ // live-out.
+ bool IsSafe = false;
+ SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+ MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+ for (++I; I != E; ++I) {
+ const MachineInstr &Instr = *I;
+ if (Instr.modifiesRegister(X86::EFLAGS, TRI)) {
+ // It is safe to remove CmpInstr if EFLAGS is updated again.
+ IsSafe = true;
+ break;
+ }
+ if (!Instr.readsRegister(X86::EFLAGS, TRI))
+ continue;
+
+ // EFLAGS is used by this instruction.
+ X86::CondCode OldCC;
+ bool OpcIsSET = false;
+ if (IsCmpZero || IsSwapped) {
+      // We decode the condition code from the opcode.
+ if (Instr.isBranch())
+ OldCC = getCondFromBranchOpc(Instr.getOpcode());
+ else {
+ OldCC = getCondFromSETOpc(Instr.getOpcode());
+ if (OldCC != X86::COND_INVALID)
+ OpcIsSET = true;
+ else
+ OldCC = getCondFromCMovOpc(Instr.getOpcode());
+ }
+ if (OldCC == X86::COND_INVALID) return false;
+ }
+ if (IsCmpZero) {
+ switch (OldCC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+        // CF and OF are used; we can't perform this optimization.
+ return false;
+ }
+ } else if (IsSwapped) {
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+ // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ // We swap the condition code and synthesize the new opcode.
+ X86::CondCode NewCC = getSwappedCondition(OldCC);
+ if (NewCC == X86::COND_INVALID) return false;
+
+ // Synthesize the new opcode.
+ bool HasMemoryOperand = Instr.hasOneMemOperand();
+ unsigned NewOpc;
+ if (Instr.isBranch())
+ NewOpc = GetCondBranchFromCond(NewCC);
+      else if (OpcIsSET)
+ NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ else {
+ unsigned DstReg = Instr.getOperand(0).getReg();
+ NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ HasMemoryOperand);
+ }
+
+ // Push the MachineInstr to OpsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // instructions will be modified.
+ OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+ }
+ if (Instr.killsRegister(X86::EFLAGS, TRI)) {
+ IsSafe = true;
+ break;
+ }
+ }
+
+  // If EFLAGS is neither killed nor redefined, we need to check whether it
+  // is live-out. If it is live-out, do not optimize.
+ if ((IsCmpZero || IsSwapped) && !IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(X86::EFLAGS))
+ return false;
+ }
+
+ // The instruction to be updated is either Sub or MI.
+ Sub = IsCmpZero ? MI : Sub;
+ // Move Movr0Inst to the place right before Sub.
+ if (Movr0Inst) {
+ Sub->getParent()->remove(Movr0Inst);
+ Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+ }
+
+  // Make sure the Sub instruction defines EFLAGS.
+ assert(Sub->getNumOperands() >= 2 &&
+ Sub->getOperand(Sub->getNumOperands()-1).isReg() &&
+ Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS &&
+ "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
+ Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true);
+ CmpInstr->eraseFromParent();
+
+ // Modify the condition code of instructions in OpsToUpdate.
+ for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+ OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+ return true;
+}
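
(Editorial aside: putting the pieces together, these are the two rewrites
the routine enables, hand-written before/after for illustration:)

    # IsCmpZero (AT&T syntax): the and already sets ZF, so the test is dropped.
        andl  $7, %edi                      andl  $7, %edi
        testl %edi, %edi         =>         je    .LBB0_2
        je    .LBB0_2

    ; IsSwapped (schematic pre-regalloc machine IR, EFLAGS operands elided):
    ; the cmp duplicates the sub with operands reversed, so the cmp is erased
    ; and the flag user's condition code is swapped via getSwappedCondition.
        %v2 = SUB32rr %v1, %v0              %v2 = SUB32rr %v1, %v0
        CMP32rr %v0, %v1         =>         %v3 = SETLr
        %v3 = SETGr
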
+
/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
/// instruction with two undef reads of the register being defined. This is
/// used for mapping:
@@ -2809,7 +3491,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
bool NarrowToMOV32rm = false;
if (Size) {
- unsigned RCSize = getRegClass(MI->getDesc(), i, &RI)->getSize();
+ unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
if (Size < RCSize) {
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
@@ -3202,7 +3884,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
@@ -3297,7 +3979,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the store instruction.
if (UnfoldStore) {
- const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
std::pair<MachineInstr::mmo_iterator,
MachineInstr::mmo_iterator> MMOs =
MF.extractStoreMemRefs(MI->memoperands_begin(),
@@ -3323,7 +4005,8 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
unsigned NumDefs = MCID.NumDefs;
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
@@ -3344,7 +4027,6 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// Emit the load instruction.
SDNode *Load = 0;
- MachineFunction &MF = DAG.getMachineFunction();
if (FoldedLoad) {
EVT VT = *RC->vt_begin();
std::pair<MachineInstr::mmo_iterator,
@@ -3371,7 +4053,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
std::vector<EVT> VTs;
const TargetRegisterClass *DstRC = 0;
if (MCID.getNumDefs() > 0) {
- DstRC = getRegClass(MCID, 0, &RI);
+ DstRC = getRegClass(MCID, 0, &RI, MF);
VTs.push_back(*DstRC->vt_begin());
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
@@ -3625,7 +4307,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
// Create the register. The code to initialize it is inserted
// later, by the CGBR pass (below).
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
X86FI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
@@ -3835,7 +4517,7 @@ namespace {
unsigned PC;
if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
- PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
else
PC = GlobalBaseReg;
@@ -3869,3 +4551,117 @@ namespace {
char CGBR::ID = 0;
FunctionPass*
llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+ struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is nonzero, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ if (TLSBaseAddrReg)
+ I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = SetRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+ I != E; ++I) {
+ Changed |= VisitNode(*I, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ is64Bit ? X86::RAX : X86::EAX)
+ .addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+    // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF->getTarget());
+ const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+ const X86InstrInfo *TII = TM->getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+ ? &X86::GR64RegClass
+ : &X86::GR32RegClass);
+
+ // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg)
+ .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+ return Copy;
+ }
+
+ virtual const char *getPassName() const {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
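
(Editorial aside: for context, a hypothetical translation unit where this
pass pays off. Under -fPIC, TLS variables with internal linkage can use the
local-dynamic model, in which every access needs the DSO's TLS base address;
the flag and file name below are illustrative:)

    // tls_ld_sketch.cpp (hypothetical): compile into a shared object, e.g.
    //   clang++ -fPIC -ftls-model=local-dynamic -O2 -c tls_ld_sketch.cpp
    static __thread int x;   // internal linkage: local-dynamic model applies
    static __thread int y;

    int sum() {
      // Without the pass, each access can materialize its own TLS_base_addr
      // (i.e. a __tls_get_addr call). The pass keeps the first call on each
      // dominator-tree path and rewrites dominated ones into register copies.
      return x + y;
    }
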
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b23d756..ec9b2e6 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -128,8 +128,8 @@ class X86InstrInfo : public X86GenInstrInfo {
X86TargetMachine &TM;
const X86RegisterInfo RI;
- /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
- /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
///
typedef DenseMap<unsigned,
std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
@@ -137,6 +137,7 @@ class X86InstrInfo : public X86GenInstrInfo {
RegOp2MemOpTableType RegOp2MemOpTable0;
RegOp2MemOpTableType RegOp2MemOpTable1;
RegOp2MemOpTableType RegOp2MemOpTable2;
+ RegOp2MemOpTableType RegOp2MemOpTable3;
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
@@ -144,9 +145,9 @@ class X86InstrInfo : public X86GenInstrInfo {
std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
MemOp2RegOpTableType MemOp2RegOpTable;
- void AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable,
- unsigned RegOp, unsigned MemOp, unsigned Flags);
+ static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags);
public:
explicit X86InstrInfo(X86TargetMachine &tm);
@@ -218,6 +219,14 @@ public:
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const;
+ virtual bool canInsertSelect(const MachineBasicBlock&,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned, unsigned, int&, int&, int&) const;
+ virtual void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
@@ -363,6 +372,21 @@ public:
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const;
+ /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if it has two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2,
+ int &CmpMask, int &CmpValue) const;
+
+ /// optimizeCompareInstr - Check if there exists an earlier instruction that
+ /// operates on the same source operands and sets flags in the same way as
+ /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const;
+
private:
MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 6a25312..d293156 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -63,6 +63,10 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,
[SDTCisInt<0>,
SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
@@ -95,6 +99,8 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
@@ -131,6 +137,11 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
+
+def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -199,6 +210,9 @@ def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
[SDNPHasChain]>;
@@ -278,6 +292,20 @@ def X86Mem256AsmOperand : AsmOperandClass {
let Name = "Mem256"; let PredicateMethod = "isMem256";
}
+// Gather mem operands
+def X86MemVX32Operand : AsmOperandClass {
+ let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+}
+def X86MemVY32Operand : AsmOperandClass {
+ let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+}
+def X86MemVX64Operand : AsmOperandClass {
+ let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+}
+def X86MemVY64Operand : AsmOperandClass {
+ let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+}
+
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
@@ -316,6 +344,20 @@ def f128mem : X86MemOperand<"printf128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def f256mem : X86MemOperand<"printf256mem">{
let ParserMatchClass = X86Mem256AsmOperand; }
+
+// Gather mem operands
+def vx32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX32Operand; }
+def vy32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY32Operand; }
+def vx64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+ let ParserMatchClass = X86MemVX64Operand; }
+def vy64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY64Operand; }
}
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -328,7 +370,7 @@ def i8mem_NOREX : Operand<i64> {
}
// GPRs available for tailcall.
-// It represents GR64_TC or GR64_TCW64.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
def ptr_rc_tailcall : PointerLikeRegClass<2>;
// Special i32mem for addresses of load folding tail calls. These are not
@@ -336,7 +378,8 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>;
// after callee-saved registers are popped.
def i32mem_TC : Operand<i32> {
let PrintMethod = "printi32mem";
- let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+ i32imm, i8imm);
let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -487,6 +530,9 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
+def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex,
X86WrapperRIP], []>;
@@ -494,6 +540,9 @@ def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
+def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
def HasCMov : Predicate<"Subtarget->hasCMov()">;
@@ -514,8 +563,8 @@ def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
-def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">;
-def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
+def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
@@ -680,25 +729,27 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
// Nop
let neverHasSideEffects = 1 in {
- def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
- "nop{w}\t$zero", []>, TB, OpSize;
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
- "nop{l}\t$zero", []>, TB;
+ "nop{l}\t$zero", [], IIC_NOP>, TB;
}
// Constructing a stack frame.
def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
- "enter\t$len, $lvl", []>;
+ "enter\t$len, $lvl", [], IIC_ENTER>;
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
def LEAVE : I<0xC9, RawFrm,
- (outs), (ins), "leave", []>, Requires<[In32BitMode]>;
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In32BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
def LEAVE64 : I<0xC9, RawFrm,
- (outs), (ins), "leave", []>, Requires<[In64BitMode]>;
+ (outs), (ins), "leave", [], IIC_LEAVE>,
+ Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
@@ -706,41 +757,49 @@ def LEAVE64 : I<0xC9, RawFrm,
let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
let mayLoad = 1 in {
-def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
- OpSize;
-def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
-def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
- OpSize;
-def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>,
- OpSize;
-def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
-def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>;
-
-def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize;
-def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>,
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG16>, OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+ IIC_POP_REG>, OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+ IIC_POP_REG>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+ IIC_POP_MEM>;
+
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
Requires<[In32BitMode]>;
}
let mayStore = 1 in {
-def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
- OpSize;
-def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
-def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
- OpSize;
-def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>,
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+ IIC_PUSH_REG>, OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>,
OpSize;
-def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
-def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+ IIC_PUSH_REG>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>;
def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
- "push{l}\t$imm", []>;
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", []>, OpSize;
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
- "push{l}\t$imm", []>;
+ "push{l}\t$imm", [], IIC_PUSH_IMM>;
-def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize;
-def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+ OpSize;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
Requires<[In32BitMode]>;
}
@@ -749,44 +808,48 @@ def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
let mayLoad = 1 in {
def POP64r : I<0x58, AddRegFrm,
- (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
-def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
-def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>;
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+ IIC_POP_MEM>;
}
let mayStore = 1 in {
def PUSH64r : I<0x50, AddRegFrm,
- (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
-def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
-def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>;
+ (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+ IIC_PUSH_MEM>;
}
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
- "push{q}\t$imm", []>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>;
}
let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
-def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
Requires<[In64BitMode]>;
let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
-def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
Requires<[In64BitMode]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad=1, neverHasSideEffects=1 in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
Requires<[In32BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore=1, neverHasSideEffects=1 in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
Requires<[In32BitMode]>;
}
@@ -794,84 +857,92 @@ let Constraints = "$src = $dst" in { // GR32 = bswap GR32
def BSWAP32r : I<0xC8, AddRegFrm,
(outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
- [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB;
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
- [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+ [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
} // Constraints = "$src = $dst"
// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+ IIC_BSF>, TB, OpSize;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB,
- OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+ IIC_BSF>, TB, OpSize;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+ IIC_BSF>, TB;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+ IIC_BSF>, TB;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+ IIC_BSF>, TB;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize;
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+ TB, OpSize;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB,
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+ IIC_BSR>, TB,
OpSize;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+ IIC_BSR>, TB;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+ IIC_BSR>, TB;
} // Defs = [EFLAGS]
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>;
-def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize;
-def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>;
-def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
}
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI.
let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>;
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize;
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>;
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
-def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
-def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>;
-def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize;
-def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>;
-def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
-def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>;
-def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize;
-def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>;
-def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
//===----------------------------------------------------------------------===//
@@ -880,64 +951,64 @@ def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
let neverHasSideEffects = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", []>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, imm:$src)]>;
+ [(set GR8:$dst, imm:$src)], IIC_MOV>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, imm:$src)]>, OpSize;
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, imm:$src)]>;
+ [(set GR32:$dst, imm:$src)], IIC_MOV>;
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, imm:$src)]>;
+ [(set GR64:$dst, imm:$src)], IIC_MOV>;
def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, i64immSExt32:$src)]>;
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
}
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm:$src), addr:$dst)]>;
+ [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+ [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm:$src), addr:$dst)]>;
+ [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32:$src, addr:$dst)]>;
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
/// moffs8, moffs16, and moffs32 versions of moves. The immediate is an
/// absolute 32-bit address. These are only valid in x86-32 mode.
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{b}\t{$src, %al|AL, $src}", []>,
+ "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize,
+ "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
- "mov{l}\t{$src, %eax|EAX, $src}", []>,
+ "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{b}\t{%al, $dst|$dst, AL}", []>,
+ "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize,
+ "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
- "mov{l}\t{%eax, $dst|$dst, EAX}", []>,
+ "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
// FIXME: These definitions are utterly broken
@@ -958,42 +1029,42 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
let isCodeGenOnly = 1 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", []>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(set GR8:$dst, (loadi8 addr:$src))]>;
+ [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (loadi32 addr:$src))]>;
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (load addr:$src))]>;
+ [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
}
def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store GR8:$src, addr:$dst)]>;
+ [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store GR16:$src, addr:$dst)]>, OpSize;
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store GR32:$src, addr:$dst)]>;
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store GR64:$src, addr:$dst)]>;
+ [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
// that they can be used for copying and storing h registers, which can't be
@@ -1002,24 +1073,28 @@ let isCodeGenOnly = 1 in {
let neverHasSideEffects = 1 in
def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>;
let mayStore = 1 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>;
let mayLoad = 1, neverHasSideEffects = 1,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
- "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
+ IIC_MOV_MEM>;
}
// Condition code ops, incl. set if equal/not equal/...
-let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
-def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
-def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
+ IIC_AHF>; // AH = flags
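
For readers unfamiliar with the AH layout these two instructions use: lahf packs SF:ZF:0:AF:0:PF:1:CF into AH (bit 1 always set, bits 3 and 5 always clear), and sahf loads the same five flags back. A minimal C++ model of the packing (illustrative only; the function name is made up):

#include <cstdint>

// Pack the five status flags the way LAHF does: SF:ZF:0:AF:0:PF:1:CF.
uint8_t lahf_pack(bool sf, bool zf, bool af, bool pf, bool cf) {
  return uint8_t((sf << 7) | (zf << 6) | (af << 4) | (pf << 2) | (1 << 1) | cf);
}
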
//===----------------------------------------------------------------------===//
@@ -1028,13 +1103,14 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
let Defs = [EFLAGS] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+ OpSize, TB;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
@@ -1045,31 +1121,33 @@ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi16 addr:$src1), GR16:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, OpSize, TB, Requires<[FastBTMem]>;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi32 addr:$src1), GR32:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, TB, Requires<[FastBTMem]>;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi64 addr:$src1), GR64:$src2),
// (implicit EFLAGS)]
- []
+ [], IIC_BT_MR
>, TB;
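
As the comment above notes, the memory form of bt treats the operand as a bit string rather than a single word, so the full (possibly negative) index participates in addressing. A C++ sketch of the semantics (illustrative, not LLVM code; assumes arithmetic right shift):

#include <cstdint>

// Memory-form BT: both the byte address and the bit position come from the
// full index, so bits outside the addressed word are reachable.
bool bt_mem(const uint8_t *base, int64_t bitindex) {
  return (base[bitindex >> 3] >> (bitindex & 7)) & 1;
}
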
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
- OpSize, TB;
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+ IIC_BT_RI>, OpSize, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+ IIC_BT_RI>, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+ IIC_BT_RI>, TB;
// Note that these instructions don't need FastBTMem because that
// only applies when the other operand is in a register. When it's
@@ -1077,91 +1155,103 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
- ]>, OpSize, TB;
+ ], IIC_BT_MI>, OpSize, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
- ]>, TB;
+ ], IIC_BT_MI>, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))]>, TB;
+ i64immSExt8:$src2))], IIC_BT_MI>, TB;
def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
-                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize, TB;
def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize, TB;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
} // Defs = [EFLAGS]
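
The btc/btr/bts register forms above all share one shape: read the selected bit into CF, then complement, reset, or set it. Unlike the memory forms discussed earlier, register forms mask the index to the operand width. A sketch for bts on a 64-bit operand (name illustrative):

#include <cstdint>

bool bts64(uint64_t &word, unsigned idx) {
  idx &= 63;                     // register form ignores high index bits
  bool old = (word >> idx) & 1;  // hardware returns this bit in CF
  word |= (1ULL << idx);         // btr: &= ~(1ULL<<idx); btc: ^= (1ULL<<idx)
  return old;
}
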
@@ -1175,89 +1265,106 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
let Constraints = "$val = $dst" in {
def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
"xchg{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+ [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))],
+ IIC_XCHG_MEM>;
def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr),
"xchg{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>,
+ [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))],
+ IIC_XCHG_MEM>,
OpSize;
def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr),
"xchg{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+ [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))],
+ IIC_XCHG_MEM>;
def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr),
"xchg{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+ [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))],
+ IIC_XCHG_MEM>;
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
- "xchg{b}\t{$val, $src|$src, $val}", []>;
+ "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
- "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize;
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize;
def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
- "xchg{l}\t{$val, $src|$src, $val}", []>;
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
- "xchg{q}\t{$val, $src|$src, $val}", []>;
+ "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
}
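
xchg with a memory operand is implicitly locked, which is why the memory forms above lower from atomic_swap_* nodes. The portable C++ equivalent (on x86 this typically compiles to a single xchg):

#include <atomic>

int swap_in_place(std::atomic<int> &mem, int val) {
  return mem.exchange(val);  // implicit full barrier, like locked xchg
}
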
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize;
+ "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>;
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In32BitMode]>;
// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>;
+ "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ Requires<[In64BitMode]>;
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|RAX, $src}", []>;
+ "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize;
def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
let mayLoad = 1, mayStore = 1 in {
def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize;
def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
}
def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG8>, TB;
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB, OpSize;
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_REG>, TB;
let mayLoad = 1, mayStore = 1 in {
def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
- "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM8>, TB;
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
- "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB, OpSize;
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+ IIC_CMPXCHG_MEM>, TB;
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", []>, TB;
+ "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
- "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
+ "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+ TB, Requires<[HasCmpxchg16b]>;
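
For reference, cmpxchg r/m, r compares the accumulator with the destination; on a match it stores the source and sets ZF, otherwise it loads the observed value into the accumulator. std::atomic expresses the same contract (sketch; with a lock prefix this is the atomic compare-and-swap the defs above leave implicit):

#include <atomic>

bool cas(std::atomic<long> &mem, long &expected, long desired) {
  // On failure, 'expected' receives the observed value, mirroring EAX/RAX.
  return mem.compare_exchange_strong(expected, desired);
}
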
@@ -1281,69 +1388,75 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
// String manipulation instructions
-def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
-def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
-def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
-def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
-def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
-def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
-def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
// Flag instructions
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
// Table lookup instructions
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>;
// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, Requires<[In32BitMode]>;
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+ Requires<[In32BitMode]>;
// ASCII Adjust AX Before Division
// sets AL, AH and EFLAGS and uses AL and AH
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
- "aad\t$src", []>, Requires<[In32BitMode]>;
+ "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;
// ASCII Adjust AX After Multiply
// sets AL, AH and EFLAGS and uses AL
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
- "aam\t$src", []>, Requires<[In32BitMode]>;
+ "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;
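
aam divides AL by the immediate (10 when the short form is used) and splits it into digits: AH gets the quotient, AL the remainder. A software model (function name illustrative):

#include <cstdint>

void aam_model(uint8_t &al, uint8_t &ah, uint8_t imm = 10) {
  ah = al / imm;  // quotient -> AH
  al = al % imm;  // remainder -> AL
}
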
// ASCII Adjust AL After Subtraction
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, Requires<[In32BitMode]>;
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+ Requires<[In32BitMode]>;
// Decimal Adjust AL after Addition
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
-def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, Requires<[In32BitMode]>;
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+ Requires<[In32BitMode]>;
// Decimal Adjust AL after Subtraction
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
-def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, Requires<[In32BitMode]>;
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+ Requires<[In32BitMode]>;
// Check Array Index Against Bounds
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "bound\t{$src, $dst|$dst, $src}", []>, OpSize,
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
Requires<[In32BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "bound\t{$src, $dst|$dst, $src}", []>,
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>,
Requires<[In32BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst),
- "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+ Requires<[In32BitMode]>;
def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
- "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
+ "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+ Requires<[In32BitMode]>;
//===----------------------------------------------------------------------===//
// MOVBE Instructions
@@ -1351,22 +1464,28 @@ def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
let Predicates = [HasMOVBE] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8;
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+ OpSize, T8;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8;
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+ T8;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8;
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+ T8;
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8;
+ [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+ OpSize, T8;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR32:$src), addr:$dst)]>, T8;
+ [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
- [(store (bswap GR64:$src), addr:$dst)]>, T8;
+ [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+ T8;
}
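
movbe is a load or store fused with a byte swap, which is exactly what the (bswap (load ...)) and (store (bswap ...)) patterns above match. A scalar equivalent a compiler could fold into movbe (the builtin is GCC/Clang-specific):

#include <cstdint>
#include <cstring>

uint32_t load_be32(const void *p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof v);
  return __builtin_bswap32(v);  // byte swap -> big-endian load
}
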
//===----------------------------------------------------------------------===//
@@ -1374,11 +1493,14 @@ let Predicates = [HasMOVBE] in {
//
let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
- "rdrand{w}\t$dst", []>, OpSize, TB;
+ "rdrand{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
- "rdrand{l}\t$dst", []>, TB;
+ "rdrand{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
- "rdrand{q}\t$dst", []>, TB;
+ "rdrand{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
}
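
rdrand reports success in CF, which is why the patterns above define EFLAGS alongside the destination register. With the compiler intrinsic the carry flag surfaces as the return value (requires <immintrin.h> and -mrdrnd):

#include <immintrin.h>

bool random32(unsigned int &out) {
  return _rdrand32_step(&out) == 1;  // 0 means no entropy available yet; retry
}
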
//===----------------------------------------------------------------------===//
@@ -1774,9 +1896,9 @@ def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw %eax", (FNSTSW8r)>;
-def : InstAlias<"fnstsw %al" , (FNSTSW8r)>;
-def : InstAlias<"fnstsw" , (FNSTSW8r)>;
+def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
+def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 63f96b6..e4edd36 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -20,71 +20,130 @@
// MMX Multiclasses
//===----------------------------------------------------------------------===//
+def MMX_INTALU_ITINS : OpndItins<
+ IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+ IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+ IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+ IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+
+def MMX_PMUL_ITINS : OpndItins<
+ IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+def MMX_PSADBW_ITINS : OpndItins<
+ IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+  IIC_MMX_MISC_FUNC_REG, IIC_MMX_MISC_FUNC_MEM
+>;
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+ IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+def MMX_UNPCK_H_ITINS : OpndItins<
+ IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+ IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+ IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+ IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+
+def MMX_CVT_PD_ITINS : OpndItins<
+ IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+ IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
// When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- bit Commutable = 0> {
+ OpndItins itins, bit Commutable = 0> {
def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> {
let isCommutable = Commutable;
}
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>;
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>;
}
multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, Intrinsic IntId,
- Intrinsic IntId2> {
+ Intrinsic IntId2, ShiftOpndItins itins> {
def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>;
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>;
+ (bitconvert (load_mmx addr:$src2))))],
+ itins.rm>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>;
}
}
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64> {
+ Intrinsic IntId64, OpndItins itins> {
def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst, (IntId64 VR64:$src))]>;
+ [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
- (IntId64 (bitconvert (memopmmx addr:$src))))]>;
+ (IntId64 (bitconvert (memopmmx addr:$src))))],
+ itins.rm>;
}
/// Binary MMX instructions requiring SSSE3.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64> {
+ Intrinsic IntId64, OpndItins itins> {
let isCommutable = 0 in
def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>;
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (memopmmx addr:$src2))))]>;
+ (bitconvert (memopmmx addr:$src2))))], itins.rm>;
}
}
@@ -103,13 +162,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
- string asm, Domain d> {
+ string asm, OpndItins itins, Domain d> {
def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
[(set DstRC:$dst, (Int SrcRC:$src))],
- IIC_DEFAULT, d>;
+ itins.rr, d>;
def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
[(set DstRC:$dst, (Int (ld_frag addr:$src)))],
- IIC_DEFAULT, d>;
+ itins.rm, d>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -139,22 +198,24 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))]>;
+ (x86mmx (scalar_to_vector GR32:$src)))],
+ IIC_MMX_MOV_MM_RM>;
let canFoldAsLoad = 1 in
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>;
+ [(set VR64:$dst,
+ (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_MMX_MOV_MM_RM>;
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", []>;
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>;
def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", []>;
+ "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>;
let neverHasSideEffects = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
- []>;
+ [], IIC_MMX_MOV_MM_RM>;
// These are 64 bit moves, but since the OS X assembler doesn't
// recognize a register-register movq, we write them as
@@ -163,197 +224,276 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR64:$dst,
- (bitconvert VR64:$src))]>;
+ (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (bitconvert GR64:$src))]>;
+ (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
- "movq\t{$src, $dst|$dst, $src}", []>;
+ "movq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
let canFoldAsLoad = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst, (load_mmx addr:$src))]>;
+ [(set VR64:$dst, (load_mmx addr:$src))],
+ IIC_MMX_MOVQ_RM>;
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (x86mmx VR64:$src), addr:$dst)]>;
+ [(store (x86mmx VR64:$src), addr:$dst)],
+ IIC_MMX_MOVQ_RM>;
def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
(i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))))]>;
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))]>;
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
let neverHasSideEffects = 1 in
def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>;
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>;
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
+ IIC_MMX_MOVQ_RR>;
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
- [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+ IIC_MMX_MOVQ_RM>;
let AddedComplexity = 15 in
// movd to MMX register zero-extends
def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>;
+ (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
+ IIC_MMX_MOV_MM_RM>;
let AddedComplexity = 20 in
def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
(ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (X86vzmovl (x86mmx
- (scalar_to_vector (loadi32 addr:$src))))))]>;
+ (scalar_to_vector (loadi32 addr:$src))))))],
+ IIC_MMX_MOV_MM_RM>;
// Arithmetic Instructions
-defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>;
-defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>;
-defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>;
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+ MMX_INTALU_ITINS>;
// -- Addition
-defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>;
-defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>;
-defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>;
-defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>;
-defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
-defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
-
-defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
-defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
-
-defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>;
-defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>;
-defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>;
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+ MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+ MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+ MMX_PHADDSUBW>;
// -- Subtraction
-defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
-defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
-defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
-defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
-
-defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
-defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
-
-defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
-defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
-
-defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>;
-defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>;
-defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>;
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+ MMX_PHADDSUBW>;
+defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+ MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+ MMX_PHADDSUBW>;
// -- Multiplication
-defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
-
-defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
-defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
-defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+ MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+ MMX_PMUL_ITINS, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+ MMX_PMUL_ITINS, 1>;
let isCommutable = 1 in
defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw>;
+ int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
// -- Miscellanea
-defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+ MMX_PMUL_ITINS, 1>;
defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw>;
-defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
-defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
-
-defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
-defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
-
-defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
-defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
-
-defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
-
-defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>;
-defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>;
-defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>;
+ int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+ MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+ MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+ MMX_PSADBW_ITINS, 1>;
+
+defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+ MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+ MMX_MISC_FUNC_ITINS>;
let Constraints = "$src1 = $dst" in
defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
// Logical Instructions
-defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
-defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>;
-defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
-defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>;
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+ MMX_INTALU_ITINS, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+ MMX_INTALU_ITINS>;
// Shift Instructions
defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
- int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
- int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
- int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
- int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
- int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+ MMX_SHIFT_ITINS>;
defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
- int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
- int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+ MMX_SHIFT_ITINS>;
defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
- int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+ MMX_SHIFT_ITINS>;
// Comparison Instructions
-defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
-defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
-defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
-
-defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
-defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
-defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+ MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+ MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+ MMX_INTALU_ITINS>;
// -- Unpack Instructions
defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
- int_x86_mmx_punpckhbw>;
+ int_x86_mmx_punpckhbw,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
- int_x86_mmx_punpckhwd>;
+ int_x86_mmx_punpckhwd,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
- int_x86_mmx_punpckhdq>;
+ int_x86_mmx_punpckhdq,
+ MMX_UNPCK_H_ITINS>;
defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
- int_x86_mmx_punpcklbw>;
+ int_x86_mmx_punpcklbw,
+ MMX_UNPCK_L_ITINS>;
defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
- int_x86_mmx_punpcklwd>;
+ int_x86_mmx_punpcklwd,
+ MMX_UNPCK_L_ITINS>;
defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
- int_x86_mmx_punpckldq>;
+ int_x86_mmx_punpckldq,
+ MMX_UNPCK_L_ITINS>;
// -- Pack Instructions
-defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
-defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
-defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+ MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+ MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+ MMX_PCK_ITINS>;
// -- Shuffle Instructions
-defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>;
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+ MMX_PSHUF_ITINS>;
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>;
+ (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+ IIC_MMX_PSHUF>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))]>;
-
+ imm:$src2))],
+ IIC_MMX_PSHUF>;
@@ -361,24 +501,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, TB;
}
// Extract / Insert
@@ -386,14 +526,16 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- (iPTR imm:$src2)))]>;
+ (iPTR imm:$src2)))],
+ IIC_MMX_PEXTR>;
let Constraints = "$src1 = $dst" in {
def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
(outs VR64:$dst),
(ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32:$src2, (iPTR imm:$src3)))]>;
+ GR32:$src2, (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>;
def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
@@ -401,7 +543,8 @@ let Constraints = "$src1 = $dst" in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
(i32 (anyext (loadi16 addr:$src2))),
- (iPTR imm:$src3)))]>;
+ (iPTR imm:$src3)))],
+ IIC_MMX_PINSRW>;
}
// Mask creation
@@ -439,11 +582,13 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
let Uses = [EDI] in
def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
let Uses = [RDI] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+ IIC_MMX_MASKMOV>;
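
maskmovq stores only the bytes whose mask byte has its most significant bit set, always through the implicit EDI/RDI pointer — hence the Uses = [EDI]/[RDI] constraints above. The C intrinsic takes the pointer explicitly (sketch):

#include <xmmintrin.h>

void masked_store(__m64 data, __m64 mask, char *p) {
  _mm_maskmove_si64(data, mask, p);  // byte i stored iff mask byte i bit 7 set
}
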
// 64-bit bit convert.
def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 65e3c1e..c2d169a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1418,10 +1418,10 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d, OpndItins itins> {
- def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
[(set DstRC:$dst, (OpNode SrcRC:$src))],
itins.rr, d>;
- def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
itins.rm, d>;
}
@@ -1622,7 +1622,7 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
"cvttsd2si{q}", SSE_CVT_SD2SI>,
XD, REX_W;
-let Pattern = []<dag> in {
+let Pattern = []<dag>, neverHasSideEffects = 1 in {
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
"cvtss2si{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
@@ -1630,14 +1630,16 @@ defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
"cvtss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>, TB, VEX,
+ Requires<[HasAVX]>;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
+ "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+ SSEPackedSingle, SSE_CVT_PS>, TB, VEX,
+ Requires<[HasAVX]>;
}
-let Pattern = []<dag> in {
+let Pattern = []<dag>, neverHasSideEffects = 1 in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
"cvtss2si{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS;
@@ -1646,8 +1648,8 @@ defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>,
- TB; /* PD SSE3 form is avaiable */
+ SSEPackedSingle, SSE_CVT_PS>, TB,
+ Requires<[HasSSE2]>;
}
let Predicates = [HasAVX] in {
@@ -1788,57 +1790,6 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
Requires<[HasSSE2]>;
}
-// Convert doubleword to packed single/double fp
-// SSE2 instructions without OpSize prefix
-def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vcvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PS_RM>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- TB, Requires<[HasSSE2]>;
-def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "cvtdq2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PS_RM>,
- TB, Requires<[HasSSE2]>;
-
-// FIXME: why the non-intrinsic version is described as SSE3?
-// SSE2 instructions with XS prefix
-def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XS, VEX, Requires<[HasAVX]>;
-def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>,
- XS, VEX, Requires<[HasAVX]>;
-def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XS, Requires<[HasSSE2]>;
-def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
- (bitconvert (memopv2i64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>,
- XS, Requires<[HasSSE2]>;
-
-
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
@@ -1859,51 +1810,63 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RM>;
-def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>,
- VEX;
-def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
-def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>;
-def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PS_RM>;
-
-// SSE2 packed instructions with XD prefix
-def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XD, VEX, Requires<[HasAVX]>;
-def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- XD, VEX, Requires<[HasAVX]>;
-def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- XD, Requires<[HasSSE2]>;
-def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- XD, Requires<[HasSSE2]>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_cvtps2dq VR128:$src),
+ (VCVTPS2DQrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)),
+ (VCVTPS2DQrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_cvtps2dq VR128:$src),
+ (CVTPS2DQrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)),
+ (CVTPS2DQrm addr:$src)>;
+}
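// Illustrative sketch of the scheme used above: rather than a second,
// codegen-only Int_FOOrr instruction carrying the intrinsic pattern, the
// encoding is defined once and a predicate-gated Pat<> maps the intrinsic
// onto it. FOOrr, int_x86_foo, and the 0x00 opcode are placeholders, not
// part of this patch:
//
//   def FOOrr : PDI<0x00, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
//                   "foo\t{$src, $dst|$dst, $src}", []>;
//   let Predicates = [HasSSE2] in
//     def : Pat<(int_x86_foo VR128:$src), (FOOrr VR128:$src)>;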
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when memory operands are used.
+// Provide separate rr and rm assembly forms to address this explicitly.
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+ "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+}
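// Illustrative (editorial, not part of the patch): with a register source
// the operand width is implied by xmm vs. ymm, but a memory source is
// ambiguous, so the suffixed spellings pin the width down:
//
//   vcvtpd2dqx (%rax), %xmm0    ; 128-bit memory form -> VCVTPD2DQXrm
//   vcvtpd2dqy (%rax), %xmm0    ; 256-bit memory form -> VCVTPD2DQYrm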
+
+def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RM>;
+def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
+ IIC_SSE_CVT_PD_RR>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)),
+ (VCVTPD2DQXrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)),
+ (CVTPD2DQrm addr:$src)>;
+}
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
@@ -1915,7 +1878,7 @@ def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (memop addr:$src)))],
+ (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1936,14 +1899,19 @@ def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memop addr:$src)))],
+ (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>;
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_VCVTDQ2PSrr VR128:$src)>;
+ (VCVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (Int_VCVTDQ2PSrm addr:$src)>;
+ (VCVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (VCVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(VCVTTPS2DQrr VR128:$src)>;
@@ -1963,9 +1931,14 @@ let Predicates = [HasAVX] in {
let Predicates = [HasSSE2] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_CVTDQ2PSrr VR128:$src)>;
+ (CVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (Int_CVTDQ2PSrm addr:$src)>;
+ (CVTDQ2PSrm addr:$src)>;
+
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+ (CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ (CVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(CVTTPS2DQrr VR128:$src)>;
@@ -1978,12 +1951,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
[(set VR128:$dst,
(int_x86_sse2_cvttpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
-let isCodeGenOnly = 1 in
-def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX;
+
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
@@ -1991,31 +1959,38 @@ def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))],
+ (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when memory operands are used.
// Provide separate rr and rm assembly forms to address this explicitly.
-def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
-def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
+ "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+ (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX]
// Convert packed single to packed double
let Predicates = [HasAVX] in {
@@ -2033,35 +2008,71 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB, VEX;
}
+
+let Predicates = [HasSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB;
+}
-def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- TB, VEX, Requires<[HasAVX]>;
-def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>,
- TB, Requires<[HasSSE2]>;
-def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtps2pd
- (load addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- TB, Requires<[HasSSE2]>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_cvtps2pd VR128:$src),
+ (VCVTPS2PDrr VR128:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_cvtps2pd VR128:$src),
+ (CVTPS2PDrr VR128:$src)>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+def VCVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDYrm : SSDI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDYrr : SSDI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
+def CVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_CVT_PD_RM>;
+def CVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_CVT_PD_RR>;
+
+// 128-bit register conversion intrinsics
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src),
+ (VCVTDQ2PDrr VR128:$src)>;
+
+let Predicates = [HasSSE2] in
+def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src),
+ (CVTDQ2PDrr VR128:$src)>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
+ (VCVTDQ2PDYrr VR128:$src)>;
+ def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))),
+ (VCVTDQ2PDYrm addr:$src)>;
+
+ def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
+ (VCVTPD2DQYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
+ (VCVTPD2DQYrm addr:$src)>;
+
+ def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
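// Illustrative (editorial): the sint_to_fp patterns above let plain IR
// conversions select the ymm forms without going through an intrinsic:
//
//   %d = sitofp <4 x i32> %v to <4 x double>   ; selects VCVTDQ2PDYrr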
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -2070,25 +2081,24 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
-def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
-def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>, VEX;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "cvtpd2psy\t{$src, $dst|$dst, $src}", [],
+ "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>;
@@ -2097,64 +2107,60 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
IIC_SSE_CVT_PD_RM>;
-def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
-def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
-def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
- IIC_SSE_CVT_PD_RR>;
-def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
- (memop addr:$src)))],
- IIC_SSE_CVT_PD_RM>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)),
+ (VCVTPD2PSXrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)),
+ (CVTPD2PSrm addr:$src)>;
+}
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
-def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
- (VCVTDQ2PSYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
- (VCVTDQ2PSYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src),
- (VCVTPD2PSYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)),
- (VCVTPD2PSYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src),
- (VCVTPS2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)),
- (VCVTPS2DQYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src),
- (VCVTPS2PDYrr VR128:$src)>;
-def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)),
- (VCVTPS2PDYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
- (VCVTTPD2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
- (VCVTTPD2DQYrm addr:$src)>;
-
-// Match fround and fextend for 128/256-bit conversions
-def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
- (VCVTPD2PSYrr VR256:$src)>;
-def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
- (VCVTPD2PSYrm addr:$src)>;
-
-def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
- (VCVTPS2PDYrr VR128:$src)>;
-def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
- (VCVTPS2PDYrm addr:$src)>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+ (VCVTDQ2PSYrm addr:$src)>;
+
+ def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)),
+ (VCVTPD2PSYrm addr:$src)>;
+
+ def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src),
+ (VCVTPS2DQYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)),
+ (VCVTPS2DQYrm addr:$src)>;
+
+ def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src),
+ (VCVTPS2PDYrr VR128:$src)>;
+ def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)),
+ (VCVTPS2PDYrm addr:$src)>;
+
+ def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
+ (VCVTTPD2DQYrr VR256:$src)>;
+ def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
+ (VCVTTPD2DQYrm addr:$src)>;
+
+ // Match fround and fextend for 128/256-bit conversions
+ def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+ def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
+
+ def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+ (VCVTPS2PDYrr VR128:$src)>;
+ def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+ (VCVTPS2PDYrm addr:$src)>;
+}
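// Illustrative (editorial): the fround/fextend patterns above cover the
// generic DAG nodes produced by ordinary IR conversions:
//
//   %t = fptrunc <4 x double> %v to <4 x float>   ; selects VCVTPD2PSYrr
//   %u = fpext <4 x float> %w to <4 x double>     ; selects VCVTPS2PDYrr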
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
@@ -3336,13 +3342,6 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
IIC_SSE_MOVNT>, VEX;
}
-def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
- (VMOVNTDQYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
- (VMOVNTPDYmr addr:$dst, VR256:$src)>;
-def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
- (VMOVNTPSYmr addr:$dst, VR256:$src)>;
-
let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
@@ -4610,7 +4609,7 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
// Bitcast FR64 <-> GR64
//
let Predicates = [HasAVX] in
-def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+def VMOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
VEX;
@@ -4623,7 +4622,7 @@ def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, VEX;
-def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+def MOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
IIC_SSE_MOVDQ>;
@@ -4897,80 +4896,6 @@ def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
//===---------------------------------------------------------------------===//
-// SSE3 - Conversion Instructions
-//===---------------------------------------------------------------------===//
-
-// Convert Packed Double FP to Packed DW Integers
-let Predicates = [HasAVX] in {
-// The assembler can recognize rr 256-bit instructions by seeing a ymm
-// register, but the same isn't true when using memory operands instead.
-// Provide other assembly rr and rm forms to address this explicitly.
-def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-
-// XMM only
-def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
-
-// YMM only
-def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
- "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
-}
-
-def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>;
-def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtpd2dq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>;
-
-def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
- (VCVTTPD2DQYrr VR256:$src)>;
-def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
- (VCVTTPD2DQYrm addr:$src)>;
-
-// Convert Packed DW Integers to Packed Double FP
-let Predicates = [HasAVX] in {
-def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-}
-
-def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RR>;
-def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>;
-
-// AVX 256-bit register conversion intrinsics
-def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
- (VCVTDQ2PDYrr VR128:$src)>;
-def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))),
- (VCVTDQ2PDYrm addr:$src)>;
-
-def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
- (VCVTPD2DQYrr VR256:$src)>;
-def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
- (VCVTPD2DQYrm addr:$src)>;
-
-def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PDYrr VR128:$src)>;
-def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (VCVTDQ2PDYrm addr:$src)>;
-
-//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
@@ -5730,14 +5655,26 @@ let Predicates = [HasSSE41] in {
(PMOVZXDQrm addr:$src)>;
}
+let Predicates = [HasAVX2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+ (VPMOVZXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+ (VPMOVZXWDYrr VR128:$src)>;
+ }
+
+ def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
let Predicates = [HasAVX] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
}
let Predicates = [HasSSE41] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
@@ -6608,15 +6545,15 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
- VR128, memopv4f32, i128mem, 0>, VEX_4V;
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
- int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V;
+ int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, i128mem, 0>, VEX_4V;
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
- int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V;
+ int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem, 0>, VEX_4V;
@@ -6625,10 +6562,10 @@ let Predicates = [HasAVX] in {
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
- VR128, memopv4f32, i128mem, 0>, VEX_4V;
+ VR128, memopv4f32, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
- VR128, memopv2f64, i128mem, 0>, VEX_4V;
+ VR128, memopv2f64, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, memopv8f32, i256mem, 0>, VEX_4V;
@@ -6647,10 +6584,10 @@ let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
- VR128, memopv4f32, i128mem>;
+ VR128, memopv4f32, f128mem>;
let ExeDomain = SSEPackedDouble in
defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, i128mem>;
+ VR128, memopv2f64, f128mem>;
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
@@ -6658,10 +6595,10 @@ let Constraints = "$src1 = $dst" in {
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
- VR128, memopv4f32, i128mem>;
+ VR128, memopv4f32, f128mem>;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
- VR128, memopv2f64, i128mem>;
+ VR128, memopv2f64, f128mem>;
}
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
@@ -6687,15 +6624,15 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem,
+defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
memopv2f64, int_x86_sse41_blendvpd>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
memopv4f64, int_x86_avx_blendv_pd_256>;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem,
+defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
memopv4f32, int_x86_sse41_blendvps>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
memopv8f32, int_x86_avx_blendv_ps_256>;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
@@ -6766,7 +6703,7 @@ let Predicates = [HasAVX2] in {
/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- Intrinsic IntId> {
+ X86MemOperand x86memop, Intrinsic IntId> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
@@ -6775,7 +6712,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
OpSize;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2),
+ (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst,
@@ -6785,14 +6722,28 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
}
let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
int_x86_sse41_blendvps>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
int_x86_sse41_pblendvb>;
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
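// Illustrative (editorial): with the aliases above the assembler also
// accepts the three-operand spelling that names the implicit selector:
//
//   blendvps %xmm0, %xmm2, %xmm1   ; same encoding as "blendvps %xmm2, %xmm1"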
let Predicates = [HasSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
@@ -7204,52 +7155,50 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
OpSize;
//===----------------------------------------------------------------------===//
-// CLMUL Instructions
+// PCLMUL Instructions
//===----------------------------------------------------------------------===//
-// Carry-less Multiplication instructions
-let neverHasSideEffects = 1 in {
// AVX carry-less Multiplication instructions
-def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>;
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
-let mayLoad = 1 in
-def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>;
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
+// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
-def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>;
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
-let mayLoad = 1 in
-def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>;
+ [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+ (memopv2i64 addr:$src2), imm:$src3))]>;
} // Constraints = "$src1 = $dst"
-} // neverHasSideEffects = 1
multiclass pclmul_alias<string asm, int immop> {
- def : InstAlias<!strconcat("pclmul", asm,
- "dq {$src, $dst|$dst, $src}"),
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
(PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
- def : InstAlias<!strconcat("pclmul", asm,
- "dq {$src, $dst|$dst, $src}"),
+ def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
(PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
- def : InstAlias<!strconcat("vpclmul", asm,
+ def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
(VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
- def : InstAlias<!strconcat("vpclmul", asm,
+ def : InstAlias<!strconcat("vpclmul", asm,
"dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
(VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
@@ -7259,6 +7208,45 @@ defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
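// Illustrative (editorial): each pclmul_alias instantiation folds the
// quadword-selector immediate into the mnemonic, e.g. for <"lqhq", 0x10>:
//
//   pclmullqhqdq %xmm1, %xmm0   ==   pclmulqdq $0x10, %xmm1, %xmm0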
//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
+ (ins VR128:$src, i8imm:$len, i8imm:$idx),
+ "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+ imm:$idx))]>, TB, OpSize;
+def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "extrq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+ VR128:$mask))]>, TB, OpSize;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+ "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+ [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
+ VR128:$src2, imm:$len, imm:$idx))]>, XD;
+def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src, VR128:$mask),
+ "insertq\t{$mask, $src|$src, $mask}",
+ [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+ VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movntss\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movntsd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
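// Illustrative (editorial): per the asm string above, the AT&T operand
// order for the immediate form is $idx, $len, $src, so
//
//   extrq $4, $8, %xmm0   ; extract 8 bits starting at bit 4 of xmm0[63:0]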
+
+//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//
@@ -7286,7 +7274,7 @@ let ExeDomain = SSEPackedSingle in {
int_x86_avx_vbroadcast_ss_256>;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
+def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
int_x86_avx_vbroadcast_sd_256>;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256>;
@@ -7298,8 +7286,8 @@ let ExeDomain = SSEPackedSingle in {
int_x86_avx2_vbroadcast_ss_ps_256>;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>;
+def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+ int_x86_avx2_vbroadcast_sd_pd_256>;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
@@ -7595,7 +7583,6 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
-let Predicates = [HasAVX, HasF16C] in {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (Int VR128:$src))]>,
@@ -7604,27 +7591,26 @@ let Predicates = [HasAVX, HasF16C] in {
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
}
-}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
-let Predicates = [HasAVX, HasF16C] in {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
TA, OpSize, VEX;
- let neverHasSideEffects = 1, mayLoad = 1 in
- def mr : Ii8<0x1D, MRMDestMem, (outs x86memop:$dst),
- (ins RC:$src1, i32i8imm:$src2),
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : Ii8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
TA, OpSize, VEX;
}
-}
-defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
-defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
-defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
-defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+let Predicates = [HasAVX, HasF16C] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+}
//===----------------------------------------------------------------------===//
// AVX2 Instructions
@@ -7711,6 +7697,55 @@ let Predicates = [HasAVX2] in {
(VPBROADCASTQrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VPBROADCASTQYrm addr:$src)>;
+
+ def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBrr VR128:$src)>;
+ def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBYrr VR128:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWrr VR128:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWYrr VR128:$src)>;
+ def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDrr VR128:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDYrr VR128:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQYrr VR128:$src)>;
+ def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSrr VR128:$src)>;
+ def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSYrr VR128:$src)>;
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VBROADCASTSDYrr VR128:$src)>;
+
+  // Provide a fallback in case the load node used in the patterns above
+  // has additional users, which prevents those patterns from being selected.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSrr
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VBROADCASTSSYrr
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VBROADCASTSDYrr
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSrr
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VBROADCASTSSYrr
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VBROADCASTSDYrr
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd))>;
+ }
}
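// Editorial note on the fallback patterns above: the INSERT_SUBREG of an
// IMPLICIT_DEF places the scalar in lane 0 of an otherwise-undef vector so
// the register-broadcast encoding applies; roughly, for a v4f32 broadcast:
//
//   %t = IMPLICIT_DEF            ; undef v4f32
//   %t.sub_ss = COPY %fr32_src   ; scalar into element 0
//   %dst = VBROADCASTSSrr %t     ; replicate element 0 to all lanes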
// AVX1 broadcast patterns
@@ -7718,16 +7753,62 @@ let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
- (VBROADCASTSDrm addr:$src)>;
+ (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDrm addr:$src)>;
-
+ (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
+
+  // Provide a fallback in case the load node used in the patterns above
+  // has additional users, which prevents those patterns from being selected.
+ let AddedComplexity = 20 in {
+    // 128-bit broadcasts:
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>;
+ def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss),
+ 0), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+ 0x44),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd),
+ 0x44), 1)>;
+
+ def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0)>;
+ def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss),
+ 0), 1)>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd),
+ 0x44),
+ sub_xmm),
+ (VPSHUFDri
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd),
+ 0x44), 1)>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -7820,8 +7901,8 @@ let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>,
- VEX_4V;
+ []>, VEX_4V;
+let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -7954,3 +8035,30 @@ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+//===----------------------------------------------------------------------===//
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ (ins VR128:$src1, memop128:$src2, VR128:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3;
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ (ins RC256:$src1, memop256:$src2, RC256:$mask),
+ !strconcat(OpcodeStr,
+ "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+ []>, VEX_4VOp3, VEX_L;
+}
+
+let Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+}
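// Editorial note: "$mask = $mask_wb" ties the mask input to a mask
// writeback output, modelling that the hardware clears mask elements as
// their gathers complete; exposing the updated mask as a def keeps the
// allocator from reusing that register while the gather is in flight:
//
//   (outs VR256:$dst, VR256:$mask_wb),
//   (ins  VR256:$src1, vx64mem:$src2, VR256:$mask)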
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index bddba6c..ea716bf 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -14,7 +14,8 @@
//===----------------------------------------------------------------------===//
let Defs = [RAX, RDX] in
- def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB;
+ def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+ TB;
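// Editorial sketch: each IIC_* class attached below is resolved per-CPU by
// the scheduling model, e.g. an entry in X86ScheduleAtom.td binds it to
// functional units (unit names follow that file; the cycle count here is
// illustrative, not taken from this patch):
//
//   InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>]>,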
let Defs = [RAX, RCX, RDX] in
def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
@@ -26,14 +27,17 @@ let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
}
-def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
-def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
// Interrupt and SysCall Instructions.
let Uses = [EFLAGS] in
def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
- [(int_x86_int (i8 3))]>;
+ [(int_x86_int (i8 3))], IIC_INT3>;
+
+def : Pat<(debugtrap),
+ (INT3)>;
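// Editorial note: this maps the generic debugtrap node (llvm.debugtrap)
// onto the one-byte int3 encoding:
//
//   call void @llvm.debugtrap()   ; selects INT3 (0xCC)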
// The long form of "int $3" turns into int3 as a size optimization.
// FIXME: This doesn't work because InstAlias can't match immediate constants.
@@ -41,23 +45,25 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)]>;
+ [(int_x86_int imm:$trap)], IIC_INT>;
-def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
-def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB;
-def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB,
+def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
Requires<[In64BitMode]>;
-def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB;
-
-def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB;
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+ IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+ IIC_SYS_ENTER_EXIT>, TB;
def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB,
Requires<[In64BitMode]>;
-def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize;
-def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>;
-def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
Requires<[In64BitMode]>;
@@ -66,73 +72,73 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
//
let Defs = [AL], Uses = [DX] in
def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|AL, DX}", []>;
+ "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize;
+ "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|EAX, DX}", []>;
+ "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
- "in{b}\t{$port, %al|AL, $port}", []>;
+ "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|AX, $port}", []>, OpSize;
+ "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|EAX, $port}", []>;
+ "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|DX, AL}", []>;
+ "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize;
+ "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|DX, EAX}", []>;
+ "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
- "out{b}\t{%al, $port|$port, AL}", []>;
+ "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, AX}", []>, OpSize;
+ "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, EAX}", []>;
+ "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
-def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>;
-def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize;
-def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", []>;
+def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
//===----------------------------------------------------------------------===//
// Moves to and from debug registers
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
//===----------------------------------------------------------------------===//
// Moves to and from control registers
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
//===----------------------------------------------------------------------===//
// Segment override instruction prefixes
@@ -150,254 +156,265 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
//
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize;
def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize;
def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", []>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
//===----------------------------------------------------------------------===//
// Segmentation support instructions.
-def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize;
def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize;
// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
- "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize;
def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
-def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+ [], IIC_INVLPG>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t$dst", []>, TB, OpSize;
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t$dst", []>, TB;
+ "str{l}\t$dst", [], IIC_STR>, TB;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
- "str{q}\t$dst", []>, TB;
+ "str{q}\t$dst", [], IIC_STR>, TB;
def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
- "str{w}\t$dst", []>, TB;
+ "str{w}\t$dst", [], IIC_STR>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
- "ltr{w}\t$src", []>, TB;
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
- "ltr{w}\t$src", []>, TB;
+ "ltr{w}\t$src", [], IIC_LTR>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ OpSize;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|FS}", []>, OpSize, TB;
+ "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|GS}", []>, OpSize, TB;
+ "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|FS}", []>, TB;
+ "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|GS}", []>, TB;
+ "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ OpSize, Requires<[In32BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ Requires<[In32BitMode]>;
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+ OpSize, Requires<[In32BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+ Requires<[In32BitMode]>;
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|FS}", []>, OpSize, TB;
+ "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|FS}", []>, TB;
+ "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|GS}", []>, OpSize, TB;
+ "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|GS}", []>, TB;
+ "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", []>;
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lss{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", []>;
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
- "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+ "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
- "verr\t$seg", []>, TB;
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", []>, TB;
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
- "verw\t$seg", []>, TB;
+              "verr\t$seg", [], IIC_VERR>, TB;
def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
-              "verw\t$seg", []>, TB;
+              "verw\t$seg", [], IIC_VERW_REG>, TB;
def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
- "verw\t$seg", []>, TB;
+              "verw\t$seg", [], IIC_VERW_MEM>, TB;
//===----------------------------------------------------------------------===//
// Descriptor-table support instructions
def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdt\t$dst", []>, TB;
+ "sgdt\t$dst", [], IIC_SGDT>, TB;
def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
- "sidtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
"sidt\t$dst", []>, TB;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
- "sldt{w}\t$dst", []>, TB, OpSize;
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize;
def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
- "sldt{w}\t$dst", []>, TB;
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
- "sldt{l}\t$dst", []>, TB;
+ "sldt{l}\t$dst", [], IIC_SLDT>, TB;
// LLDT is not interpreted specially in 64-bit mode because there is no sign
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
- "sldt{q}\t$dst", []>, TB;
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
- "sldt{q}\t$dst", []>, TB;
+ "sldt{q}\t$dst", [], IIC_SLDT>, TB;
def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt\t$src", []>, TB;
+ "lgdt\t$src", [], IIC_LGDT>, TB;
def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>;
+ "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt\t$src", []>, TB;
+ "lidt\t$src", [], IIC_LIDT>, TB;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
- "lldt{w}\t$src", []>, TB;
+ "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
- "lldt{w}\t$src", []>, TB;
+ "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
//===----------------------------------------------------------------------===//
// Specialized register support
-def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
-def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
-def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
- "smsw{w}\t$dst", []>, OpSize, TB;
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB;
def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
- "smsw{l}\t$dst", []>, TB;
+ "smsw{l}\t$dst", [], IIC_SMSW>, TB;
// no m form encodable; use SMSW16m
def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
- "smsw{q}\t$dst", []>, TB;
+ "smsw{q}\t$dst", [], IIC_SMSW>, TB;
// For memory operands, there is only a 16-bit form
def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
- "smsw{w}\t$dst", []>, TB;
+ "smsw{w}\t$dst", [], IIC_SMSW>, TB;
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
- "lmsw{w}\t$src", []>, TB;
+                "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
- "lmsw{w}\t$src", []>, TB;
+                "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
-def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
//===----------------------------------------------------------------------===//
// Cache instructions
-def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
//===----------------------------------------------------------------------===//
// XSAVE instructions
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 6a8f0c8..6d3548f 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -17,17 +17,17 @@
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In32BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In32BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
Requires<[In64BitMode]>;
// 0F 01 C1
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 65bbcb5..8ec2c68 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -15,7 +15,7 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int VR128:$src))]>, VEX;
- def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
@@ -36,27 +36,19 @@ let isAsmParserOnly = 1 in {
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
- defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
- defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
}
// Scalar load 2 addr operand instructions
-let Constraints = "$src1 = $dst" in {
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
Operand memop, ComplexPattern mem_cpat> {
- def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1,
- VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX;
- def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1,
- memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1,
- (bitconvert mem_cpat:$src2)))]>, VEX;
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
}
-} // Constraints = "$src1 = $dst"
-
let isAsmParserOnly = 1 in {
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
ssmem, sse_load_f32>;
@@ -64,12 +56,26 @@ let isAsmParserOnly = 1 in {
sdmem, sse_load_f64>;
}
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ PatFrag memop> {
+ def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+ defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+ defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+}
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
@@ -88,13 +94,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2),
+ (ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
VEX_4V, VEX_W;
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src1, VR128:$src2),
+ (ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
@@ -116,25 +122,23 @@ let isAsmParserOnly = 1 in {
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
}
-multiclass xop3opimm<bits<8> opc, string OpcodeStr> {
- let neverHasSideEffects = 1 in {
- def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
- let mayLoad = 1 in
- def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, VEX;
- }
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+ def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX;
+ def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
}
let isAsmParserOnly = 1 in {
- defm VPROTW : xop3opimm<0xC1, "vprotw">;
- defm VPROTQ : xop3opimm<0xC3, "vprotq">;
- defm VPROTD : xop3opimm<0xC2, "vprotd">;
- defm VPROTB : xop3opimm<0xC0, "vprotb">;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
}
// Instruction where second source can be memory, but third must be register
@@ -146,7 +150,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst,
(Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -170,32 +174,31 @@ let isAsmParserOnly = 1 in {
}
// Instruction where second source can be memory, third must be imm8
-multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType VT> {
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V;
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ VEX_4V;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, i8imm:$src3),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
- imm:$src3)))]>, VEX_4V;
+ (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+ imm:$src3))]>, VEX_4V;
}
let isAsmParserOnly = 1 in {
- defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>;
- defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>;
- defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>;
- defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>;
- defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>;
- defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>;
- defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>;
- defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>;
+ defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+ defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+ defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+ defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+ defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+ defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+ defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+ defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
}
// Instruction where either second or third source can be memory
@@ -207,7 +210,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
VEX_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -215,7 +218,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
(bitconvert (memopv2i64 addr:$src3))))]>,
VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ (ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
@@ -237,7 +240,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
VEX_4V, VEX_I8IMM;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b578e8d..df7507c 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -156,10 +156,14 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
break;
case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+ case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
@@ -550,17 +554,38 @@ ReSimplify:
static void LowerTlsAddr(MCStreamer &OutStreamer,
X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
- bool is64Bits = MI.getOpcode() == X86::TLS_addr64;
+
+ bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
+
+ bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
MCContext &context = OutStreamer.getContext();
- if (is64Bits) {
+ if (needsPadding) {
MCInst prefix;
prefix.setOpcode(X86::DATA16_PREFIX);
OutStreamer.EmitInstruction(prefix);
}
+
+ MCSymbolRefExpr::VariantKind SRVK;
+ switch (MI.getOpcode()) {
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSGD;
+ break;
+ case X86::TLS_base_addr32:
+ SRVK = MCSymbolRefExpr::VK_TLSLDM;
+ break;
+ case X86::TLS_base_addr64:
+ SRVK = MCSymbolRefExpr::VK_TLSLD;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
+ }
+
MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
- const MCSymbolRefExpr *symRef =
- MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context);
+ const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
MCInst LEA;
if (is64Bits) {
@@ -571,6 +596,14 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
LEA.addOperand(MCOperand::CreateReg(0)); // index
LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
LEA.addOperand(MCOperand::CreateReg(0)); // seg
+ } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+ LEA.setOpcode(X86::LEA32r);
+ LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+ LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base
+ LEA.addOperand(MCOperand::CreateImm(1)); // scale
+ LEA.addOperand(MCOperand::CreateReg(0)); // index
+ LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
+ LEA.addOperand(MCOperand::CreateReg(0)); // seg
} else {
LEA.setOpcode(X86::LEA32r);
LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
@@ -582,7 +615,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
}
OutStreamer.EmitInstruction(LEA);
- if (is64Bits) {
+ if (needsPadding) {
MCInst prefix;
prefix.setOpcode(X86::DATA16_PREFIX);
OutStreamer.EmitInstruction(prefix);
@@ -609,8 +642,6 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- OutStreamer.EmitCodeRegion();
-
X86MCInstLower MCInstLowering(Mang, *MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
@@ -646,6 +677,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TLS_addr32:
case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
return LowerTlsAddr(OutStreamer, MCInstLowering, *MI);
case X86::MOVPC32r: {
@@ -715,4 +748,3 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInstLowering.Lower(MI, TmpInst);
OutStreamer.EmitInstruction(TmpInst);
}
-
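
The padding logic in LowerTlsAddr follows the standard ELF TLS calling
sequences. A hedged illustration of the two shapes (conventional x86-64
sequences, not text from this patch):

    // General-dynamic (TLS_addr64): the linker is allowed to relax this
    // sequence, so it must have a fixed length; the DATA16 (0x66) prefixes
    // emitted when needsPadding is set provide that padding:
    //   .byte  0x66
    //   leaq   x@tlsgd(%rip), %rdi
    //   .byte  0x66, 0x66
    //   rex64 callq __tls_get_addr@PLT
    //
    // Local-dynamic (TLS_base_addr64): no padding; one call fetches the
    // module's TLS base, then each variable is a cheap @dtpoff offset:
    //   leaq   x@tlsld(%rip), %rdi
    //   callq  __tls_get_addr@PLT
    //   leaq   x@dtpoff(%rax), %rcx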
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index c747109..f83a525 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -66,6 +66,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// ArgumentStackSize - The number of bytes on stack consumed by the arguments
/// being passed on the stack.
unsigned ArgumentStackSize;
+ /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+ unsigned NumLocalDynamics;
public:
X86MachineFunctionInfo() : ForceFramePointer(false),
@@ -79,7 +81,8 @@ public:
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
VarArgsFPOffset(0),
- ArgumentStackSize(0) {}
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
explicit X86MachineFunctionInfo(MachineFunction &MF)
: ForceFramePointer(false),
@@ -93,8 +96,9 @@ public:
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
VarArgsFPOffset(0),
- ArgumentStackSize(0) {}
-
+ ArgumentStackSize(0),
+ NumLocalDynamics(0) {}
+
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -130,6 +134,10 @@ public:
unsigned getArgumentStackSize() const { return ArgumentStackSize; }
void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
};
} // End llvm namespace
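
A minimal sketch of how the new counter might be consumed (the call sites
are outside this hunk; MF.getInfo<> is the usual MachineFunction accessor,
everything else here is hypothetical):

    // Hedged sketch, not patch content: count each local-dynamic TLS
    // access during lowering...
    X86MachineFunctionInfo *FI = MF.getInfo<X86MachineFunctionInfo>();
    FI->incNumLocalDynamicTLSAccesses();

    // ...then decide whether caching the TLS base address in a virtual
    // register pays for itself:
    if (FI->getNumLocalDynamicTLSAccesses() > 1) {
      // emit one TLS_base_addr and reuse its result for every access
    }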
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index b56025f..acf53f8 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -50,6 +50,10 @@ ForceStackAlign("force-align-stack",
" needed for the function."),
cl::init(false), cl::Hidden);
+cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+ cl::desc("Enable use of a base pointer for complex stack frames"));
+
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
const TargetInstrInfo &tii)
: X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
@@ -68,10 +72,12 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
SlotSize = 8;
StackPtr = X86::RSP;
FramePtr = X86::RBP;
+ BasePtr = X86::RBX;
} else {
SlotSize = 4;
StackPtr = X86::ESP;
FramePtr = X86::EBP;
+ BasePtr = X86::EBX;
}
}
@@ -90,6 +96,12 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
return -1;
}
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+ // Only enable when post-RA scheduling is enabled and this is needed.
+ return TM.getSubtargetImpl()->postRAScheduler();
+}
+
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
int reg = X86_MC::getX86RegNum(i);
@@ -146,7 +158,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
// The GR8_NOREX class is always used in a way that won't be constrained to a
// sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
// full GR8 class.
- if (RC == X86::GR8_NOREXRegisterClass)
+ if (RC == &X86::GR8_NOREXRegClass)
return RC;
const TargetRegisterClass *Super = RC;
@@ -175,7 +187,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
}
const TargetRegisterClass *
-X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+ const {
switch (Kind) {
default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
case 0: // Normal GPRs.
@@ -238,7 +251,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
if (ghcCall)
- return CSR_Ghc_SaveList;
+ return CSR_NoRegs_SaveList;
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
@@ -254,7 +267,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
if (CC == CallingConv::GHC)
- return CSR_Ghc_RegMask;
+ return CSR_NoRegs_RegMask;
if (!Is64Bit)
return CSR_32_RegMask;
if (IsWin64)
@@ -268,21 +281,33 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Set the stack-pointer register and its aliases as reserved.
Reserved.set(X86::RSP);
- Reserved.set(X86::ESP);
- Reserved.set(X86::SP);
- Reserved.set(X86::SPL);
+ for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ Reserved.set(*I);
// Set the instruction pointer register and its aliases as reserved.
Reserved.set(X86::RIP);
- Reserved.set(X86::EIP);
- Reserved.set(X86::IP);
+ for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ Reserved.set(*I);
// Set the frame-pointer register and its aliases as reserved if needed.
if (TFI->hasFP(MF)) {
Reserved.set(X86::RBP);
- Reserved.set(X86::EBP);
- Reserved.set(X86::BP);
- Reserved.set(X86::BPL);
+ for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t* RegMask = getCallPreservedMask(CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with "
+ "this calling convention.");
+
+ Reserved.set(getBaseRegister());
+ for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ Reserved.set(*I);
}
// Mark the segment registers as reserved.
@@ -293,6 +318,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::FS);
Reserved.set(X86::GS);
+ // Mark the floating point stack registers as reserved.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+
// Reserve the registers that only exist in 64-bit mode.
if (!Is64Bit) {
// These 8-bit registers are part of the x86-64 extension even though their
@@ -308,14 +343,13 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
X86::R8, X86::R9, X86::R10, X86::R11,
X86::R12, X86::R13, X86::R14, X86::R15
};
- for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI)
- Reserved.set(Reg);
+ for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
// XMM8, XMM9, ...
assert(X86::XMM15 == X86::XMM8+7);
- for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
- ++AI)
- Reserved.set(Reg);
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
}
}
@@ -326,10 +360,36 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment and there are dynamic allocas, we can't
+ // reference off of the stack pointer, so we reserve a base pointer.
+ if (needsStackRealignment(MF) && MFI->hasVarSizedObjects())
+ return true;
+
+ return false;
+}
+
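+  // A concrete trigger for the condition above, as hypothetical user code
+  // (not from the patch): the over-aligned local forces realignment and the
+  // alloca makes the frame variable-sized, so locals can no longer be
+  // referenced off the stack pointer and EBX/RBX is reserved instead.
+  //
+  //   void f(int n) {
+  //     int big[8] __attribute__((aligned(32)));
+  //     int *p = (int *)__builtin_alloca(n * sizeof(int));
+  //     p[0] = big[0];
+  //   }
+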
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- return (MF.getTarget().Options.RealignStack &&
- !MFI->hasVarSizedObjects());
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ if (!MF.getTarget().Options.RealignStack)
+ return false;
+
+ // Stack realignment requires a frame pointer. If we already started
+ // register allocation with frame pointer elimination, it is too late now.
+ if (!MRI->canReserveReg(FramePtr))
+ return false;
+
+  // If a base pointer is necessary, check that it isn't too late to reserve
+  // it.
+ if (MFI->hasVarSizedObjects())
+ return MRI->canReserveReg(BasePtr);
+ return true;
}
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
@@ -339,13 +399,6 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
F->hasFnAttr(Attribute::StackAlignment));
- // FIXME: Currently we don't support stack realignment for functions with
- // variable-sized allocas.
- // FIXME: It's more complicated than this...
- if (0 && requiresRealignment && MFI->hasVarSizedObjects())
- report_fatal_error(
- "Stack realignment in presence of dynamic allocas is not supported");
-
// If we've requested that we force align the stack do so now.
if (ForceStackAlign)
return canRealignStack(MF);
@@ -485,7 +538,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Opc = MI.getOpcode();
bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
- if (needsStackRealignment(MF))
+ if (hasBasePointer(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+ else if (needsStackRealignment(MF))
BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
else if (AfterFPPop)
BasePtr = StackPtr;
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index bee0393..1bc32cb 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -50,6 +50,11 @@ private:
///
unsigned FramePtr;
+  /// BasePtr - X86 physical register used as a base pointer in complex
+  /// stack frames, i.e., when a third base register is needed beyond SP
+  /// and FP because of variable-sized stack objects.
+ unsigned BasePtr;
+
public:
X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
@@ -65,7 +70,8 @@ public:
int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
/// Code Generation virtual methods...
- ///
+ ///
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
@@ -82,7 +88,8 @@ public:
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
- const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
/// getCrossCopyRegClass - Returns a legal register class to copy a register
/// in the specified class to or from. Returns NULL if it is possible to copy
@@ -104,6 +111,8 @@ public:
/// register scavenger to determine what registers are free.
BitVector getReservedRegs(const MachineFunction &MF) const;
+ bool hasBasePointer(const MachineFunction &MF) const;
+
bool canRealignStack(const MachineFunction &MF) const;
bool needsStackRealignment(const MachineFunction &MF) const;
@@ -121,6 +130,7 @@ public:
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getStackRegister() const { return StackPtr; }
+ unsigned getBaseRegister() const { return BasePtr; }
  // FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 5263a49..ae2d4d0 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -223,6 +223,9 @@ let Namespace = "X86" in {
def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>;
def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>;
+ // Floating-point status word
+ def FPSW : Register<"fpsw">;
+
// Status flags register
def EFLAGS : Register<"flags">;
@@ -296,26 +299,18 @@ def GR8 : RegisterClass<"X86", [i8], 8,
def GR16 : RegisterClass<"X86", [i16], 16,
(add AX, CX, DX, SI, DI, BX, BP, SP,
- R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
-}
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
def GR32 : RegisterClass<"X86", [i32], 32,
(add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
- R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
// RIP isn't really a register and it can't be used anywhere except in an
// address, but it doesn't cause trouble.
def GR64 : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
- RBX, R14, R15, R12, R13, RBP, RSP, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32 sub_32bit)];
-}
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
// Segment registers for use by MOV instructions (and others) that have a
// segment register as one operand. Always contain a 16-bit segment
@@ -336,30 +331,12 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
// operations.
def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
-def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)];
-}
-def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit)];
-}
-def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit),
- (GR32_ABCD sub_32bit)];
-}
-def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
- R8, R9, R11, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32_TC sub_32bit)];
-}
-
+ R8, R9, R11, RIP)>;
def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
R8, R9, R11)>;
@@ -373,64 +350,36 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
}
// GR16_NOREX - GR16 registers which do not require a REX prefix.
def GR16_NOREX : RegisterClass<"X86", [i16], 16,
- (add AX, CX, DX, SI, DI, BX, BP, SP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)];
-}
+ (add AX, CX, DX, SI, DI, BX, BP, SP)>;
// GR32_NOREX - GR32 registers which do not require a REX prefix.
def GR32_NOREX : RegisterClass<"X86", [i32], 32,
- (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit)];
-}
+ (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
// GR64_NOREX - GR64 registers which do not require a REX prefix.
def GR64_NOREX : RegisterClass<"X86", [i64], 64,
- (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit),
- (GR32_NOREX sub_32bit)];
-}
+ (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
// to clear upper 32-bits of RAX so is not a NOP.
-def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
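
The encoding facts behind that comment, as a hedged aside (standard x86
opcode assignments, not patch content):

    // 0x90+reg is the one-byte AddRegFrm form of "xchg %eax, reg":
    //   xchg %eax, %ecx  ->  0x91
    //   xchg %eax, %eax  ->  0x90, which is architecturally NOP.
    // In 64-bit mode a real "xchg %eax, %eax" must clear RAX[63:32], so it
    // has to use the two-byte 87 C0 encoding; excluding EAX from the
    // AddRegFrm pattern keeps the 0x90 form from being emitted by mistake.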
// GR32_NOSP - GR32 registers except ESP.
-def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-}
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
// GR64_NOSP - GR64 registers except RSP (and RIP).
-def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> {
- let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
- (GR16 sub_16bit),
- (GR32_NOSP sub_32bit)];
-}
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
// ESP.
def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
- (and GR32_NOREX, GR32_NOSP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit)];
-}
+ (and GR32_NOREX, GR32_NOSP)>;
// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
- (and GR64_NOREX, GR64_NOSP)> {
- let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
- (GR16_NOREX sub_16bit),
- (GR32_NOREX_NOSP sub_32bit)];
-}
+ (and GR64_NOREX, GR64_NOSP)>;
// A class to support the 'A' assembler constraint: EAX then EDX.
-def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> {
- let SubRegClasses = [(GR8_ABCD_L sub_8bit),
- (GR8_ABCD_H sub_8bit_hi),
- (GR16_ABCD sub_16bit)];
-}
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
@@ -458,17 +407,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
// Generic vector registers: VR64 and VR128.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- 128, (add FR32)> {
- let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
-}
-
+ 128, (add FR32)>;
def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- 256, (sequence "YMM%u", 0, 15)> {
- let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
-}
+ 256, (sequence "YMM%u", 0, 15)>;
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 17f4efd..c14407f 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Instruction Itinerary classes used for X86
+// Instruction Itinerary classes used for X86
def IIC_DEFAULT : InstrItinClass;
def IIC_ALU_MEM : InstrItinClass;
def IIC_ALU_NONMEM : InstrItinClass;
@@ -253,6 +253,42 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
def IIC_CMPX_LOCK : InstrItinClass;
def IIC_CMPX_LOCK_8 : InstrItinClass;
def IIC_CMPX_LOCK_8B : InstrItinClass;
@@ -261,13 +297,185 @@ def IIC_CMPX_LOCK_16B : InstrItinClass;
def IIC_XADD_LOCK_MEM : InstrItinClass;
def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BSF : InstrItinClass;
+def IIC_BSR : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
-def GenericItineraries : ProcessorItineraries<[], [], []>;
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered,
+// so adjacent micro-ops don't directly compete.
+//
+// MinLatency=0 indicates that RAW dependencies can be decoded in the
+// same cycle.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+ let IssueWidth = 4;
+ let MinLatency = 0;
+ let LoadLatency = 4;
+ let HighLatency = 10;
+}
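+
+// A hedged reading of how those numbers interact (interpretation of the
+// comment above, not additional patch content):
+//
+//   movl (%rdi), %eax   // load: consumers see ~LoadLatency = 4 cycles
+//   addl %eax, %ebx     // so this is scheduled ~4 cycles after the load
+//   addl %ebx, %ecx     // reg-reg RAW dep: MinLatency = 0 means it may
+//                       // be decoded in the same cycle as its producer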
include "X86ScheduleAtom.td"
-
-
-
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 77d4e56..8710261 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -106,7 +106,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
// set
- InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
// jcc
InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
@@ -294,12 +294,237 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+ InstrStage<5, [Port1]>]>,
+
InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
- InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >
+  InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+ // push segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+ // pop segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+ InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
]>;
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MinLatency = 1; // InstrStage cycles override MinLatency.
+ // OperandCycles may be used for expected latency.
+ let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
+ let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+
+ let Itineraries = AtomItineraries;
+}
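The machine model above plugs the Atom itineraries into the generic scheduler. As a minimal sketch of how a client reads those figures back, assuming only the existing InstrItineraryData interface (the helper name and the fallback policy are illustrative):

#include "llvm/MC/MCInstrItineraries.h"
using namespace llvm;

// Look up the latency the Atom itineraries assign to one scheduling class,
// e.g. the class an instruction tagged IIC_FNSTSW above resolves to.
static unsigned lookupAtomLatency(const InstrItineraryData &Itins,
                                  unsigned ItinClassIdx) {
  if (Itins.isEmpty())
    return 1;                                  // no itinerary data selected
  return Itins.getStageLatency(ItinClassIdx);  // sum of InstrStage cycles
}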
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 9a04e35..7c6788f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -62,13 +62,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
Args.push_back(Entry);
Entry.Node = Size;
Args.push_back(Entry);
- std::pair<SDValue,SDValue> CallResult =
- TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
+ TargetLowering::
+ CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
false, false, false, false,
0, CallingConv::C, /*isTailCall=*/false,
/*doesNotRet=*/false, /*isReturnValueUsed=*/false,
DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
return CallResult.second;
}
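The hunk above tracks an interface change rather than new behavior: TargetLowering::LowerCallTo now consumes a single CallLoweringInfo bundle instead of a long positional argument list. A minimal sketch of the two-step idiom, assuming Chain, Callee, Args, DAG and dl are prepared as in the hunk; the parameter-name comments follow the old argument order and are for orientation only:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Emit a void libcall through the new CallLoweringInfo path and return the
// updated chain, as the memset lowering above does for bzero.
static SDValue emitVoidLibcall(const TargetLowering &TLI, SDValue Chain,
                               SDValue Callee, TargetLowering::ArgListTy &Args,
                               SelectionDAG &DAG, DebugLoc dl) {
  TargetLowering::CallLoweringInfo
    CLI(Chain, Type::getVoidTy(*DAG.getContext()),
        /*RetSExt=*/false, /*RetZExt=*/false, /*isVarArg=*/false,
        /*isInReg=*/false, /*NumFixedArgs=*/0, CallingConv::C,
        /*isTailCall=*/false, /*doesNotRet=*/false,
        /*isReturnValueUsed=*/false, Callee, Args, DAG, dl);
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  return CallResult.second;  // second is the chain result
}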
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index ed1a409..e6e9c56 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -196,33 +196,32 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
- // FIXME: AVX codegen support is not ready.
- //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
+ if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
- if (IsIntel && ((ECX >> 1) & 0x1)) {
- HasCLMUL = true;
- ToggleFeature(X86::FeatureCLMUL);
+ if ((ECX >> 1) & 0x1) {
+ HasPCLMUL = true;
+ ToggleFeature(X86::FeaturePCLMUL);
}
- if (IsIntel && ((ECX >> 12) & 0x1)) {
- HasFMA3 = true;
- ToggleFeature(X86::FeatureFMA3);
+ if ((ECX >> 12) & 0x1) {
+ HasFMA = true;
+ ToggleFeature(X86::FeatureFMA);
}
if (IsIntel && ((ECX >> 22) & 0x1)) {
HasMOVBE = true;
ToggleFeature(X86::FeatureMOVBE);
}
- if (IsIntel && ((ECX >> 23) & 0x1)) {
+ if ((ECX >> 23) & 0x1) {
HasPOPCNT = true;
ToggleFeature(X86::FeaturePOPCNT);
}
- if (IsIntel && ((ECX >> 25) & 0x1)) {
+ if ((ECX >> 25) & 0x1) {
HasAES = true;
ToggleFeature(X86::FeatureAES);
}
- if (IsIntel && ((ECX >> 29) & 0x1)) {
+ if ((ECX >> 29) & 0x1) {
HasF16C = true;
ToggleFeature(X86::FeatureF16C);
}
@@ -254,8 +253,12 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
// Set processor type. Currently only Atom is detected.
- if (Family == 6 && Model == 28) {
+ if (Family == 6 &&
+ (Model == 28 || Model == 38 || Model == 39
+ || Model == 53 || Model == 54)) {
X86ProcFamily = IntelAtom;
+
+ UseLeaForSP = true;
ToggleFeature(X86::FeatureLeaForSP);
}
@@ -289,9 +292,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
}
- if (IsIntel && MaxLevel >= 7) {
+ if (MaxLevel >= 7) {
if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
- if (EBX & 0x1) {
+ if (IsIntel && (EBX & 0x1)) {
HasFSGSBase = true;
ToggleFeature(X86::FeatureFSGSBase);
}
@@ -299,12 +302,11 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasBMI = true;
ToggleFeature(X86::FeatureBMI);
}
- // FIXME: AVX2 codegen support is not ready.
- //if ((EBX >> 5) & 0x1) {
- // X86SSELevel = AVX2;
- // ToggleFeature(X86::FeatureAVX2);
- //}
- if ((EBX >> 8) & 0x1) {
+ if (IsIntel && ((EBX >> 5) & 0x1)) {
+ X86SSELevel = AVX2;
+ ToggleFeature(X86::FeatureAVX2);
+ }
+ if (IsIntel && ((EBX >> 8) & 0x1)) {
HasBMI2 = true;
ToggleFeature(X86::FeatureBMI2);
}
@@ -325,8 +327,8 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasPOPCNT(false)
, HasSSE4A(false)
, HasAES(false)
- , HasCLMUL(false)
- , HasFMA3(false)
+ , HasPCLMUL(false)
+ , HasFMA(false)
, HasFMA4(false)
, HasXOP(false)
, HasMOVBE(false)
@@ -424,9 +426,7 @@ bool X86Subtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const {
- //TODO: change back to ANTIDEP_CRITICAL when the
- // X86 subtarget properly sets up post RA liveness.
- Mode = TargetSubtargetInfo::ANTIDEP_NONE;
+ Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
CriticalPathRCs.clear();
return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
}
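For reference, the Family and Model values tested in the Atom hunk above come straight from CPUID leaf 1. The decoding below is the standard layout from the Intel manuals, not code from this patch:

#include <cstdint>

// Standard CPUID.1:EAX family/model decoding; the extended fields apply for
// family 6 and family 0xF, which covers the Atom check above.
static void decodeFamilyModel(uint32_t EAX, unsigned &Family, unsigned &Model) {
  Family = (EAX >> 8) & 0xf;
  Model  = (EAX >> 4) & 0xf;
  if (Family == 6 || Family == 0xf) {
    if (Family == 0xf)
      Family += (EAX >> 20) & 0xff;       // extended family
    Model += ((EAX >> 16) & 0xf) << 4;    // extended model
  }
}
// Bonnell/Saltwell Atoms then report Family 6 with Model 28, 38, 39, 53 or
// 54, which is exactly the expanded condition in the hunk above.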
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 7fd832b..1af585f 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -85,11 +85,11 @@ protected:
/// HasAES - Target has AES instructions
bool HasAES;
- /// HasCLMUL - Target has carry-less multiplication
- bool HasCLMUL;
+ /// HasPCLMUL - Target has carry-less multiplication
+ bool HasPCLMUL;
- /// HasFMA3 - Target has 3-operand fused multiply-add
- bool HasFMA3;
+ /// HasFMA - Target has 3-operand fused multiply-add
+ bool HasFMA;
/// HasFMA4 - Target has 4-operand fused multiply-add
bool HasFMA4;
@@ -203,8 +203,8 @@ public:
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAES() const { return HasAES; }
- bool hasCLMUL() const { return HasCLMUL; }
- bool hasFMA3() const { return HasFMA3; }
+ bool hasPCLMUL() const { return HasPCLMUL; }
+ bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasXOP() const { return HasXOP; }
bool hasMOVBE() const { return HasMOVBE; }
@@ -307,6 +307,8 @@ public:
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
+ bool postRAScheduler() const { return PostRAScheduler; }
+
/// getInstrItineraryData - Return the instruction itineraries based on the
/// subtarget selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
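The header renames above are mechanical: HasCLMUL becomes HasPCLMUL to match the PCLMULQDQ mnemonic, and HasFMA3 becomes plain HasFMA. A tiny illustrative use of the renamed predicates; the selection policy shown is hypothetical, not code from this patch:

// Prefer the 3-operand FMA form only when FMA4 is absent (illustrative).
static bool useThreeOperandFMA(const X86Subtarget &ST) {
  return ST.hasFMA() && !ST.hasFMA4();  // formerly hasFMA3()/hasFMA4()
}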
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index f4b7a62..b7ba568 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -140,39 +140,48 @@ public:
} // namespace
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new X86PassConfig(this, PM);
+ X86PassConfig *PC = new X86PassConfig(this, PM);
+
+ if (Subtarget.hasCMov())
+ PC->enablePass(&EarlyIfConverterID);
+
+ return PC;
}
bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
- PM.add(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+ addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+ // For ELF, clean up any local-dynamic TLS accesses.
+ if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+ addPass(createCleanupLocalDynamicTLSPass());
// For 32-bit, prepend instructions to set the "global base reg" for PIC.
if (!getX86Subtarget().is64Bit())
- PM.add(createGlobalBaseRegPass());
+ addPass(createGlobalBaseRegPass());
return false;
}
bool X86PassConfig::addPreRegAlloc() {
- PM.add(createX86MaxStackAlignmentHeuristicPass());
+ addPass(createX86MaxStackAlignmentHeuristicPass());
return false; // -print-machineinstr shouldn't print after this.
}
bool X86PassConfig::addPostRegAlloc() {
- PM.add(createX86FloatingPointStackifierPass());
+ addPass(createX86FloatingPointStackifierPass());
return true; // -print-machineinstr should print after this.
}
bool X86PassConfig::addPreEmitPass() {
bool ShouldPrint = false;
if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
- PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
+ addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
ShouldPrint = true;
}
if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
- PM.add(createX86IssueVZeroUpperPass());
+ addPass(createX86IssueVZeroUpperPass());
ShouldPrint = true;
}
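The PM.add to addPass rewrites above follow a TargetPassConfig API change: target hooks now register passes through the protected addPass member instead of touching the PassManagerBase directly. A minimal sketch of the resulting shape; MyPassConfig, getMyTargetMachine and createMyISelDag are hypothetical names:

// Install the instruction selector through the new addPass hook.
bool MyPassConfig::addInstSelector() {
  addPass(createMyISelDag(getMyTargetMachine(), getOptLevel()));
  return false;  // as above: -print-machineinstr shouldn't print after this
}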
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 718f35e..92aee0d 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -9,16 +9,19 @@
#include "X86TargetObjectFile.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
using namespace llvm;
using namespace dwarf;
-const MCExpr *X8664_MachoTargetObjectFile::
+const MCExpr *X86_64MachoTargetObjectFile::
getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI, unsigned Encoding,
MCStreamer &Streamer) const {
@@ -37,8 +40,14 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer);
}
-MCSymbol *X8664_MachoTargetObjectFile::
+MCSymbol *X86_64MachoTargetObjectFile::
getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const {
return Mang->getSymbol(GV);
}
+
+void
+X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
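The new Initialize override above is what lets TM.Options.UseInitArray take effect for x86 ELF: InitializeELF switches static-constructor emission from .ctors to .init_array sections. Any ELF object-file lowering can opt in the same way; a sketch with a hypothetical class name:

#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetMachine.h"

class MyELFTargetObjectFile : public llvm::TargetLoweringObjectFileELF {
  virtual void Initialize(llvm::MCContext &Ctx,
                          const llvm::TargetMachine &TM) {
    TargetLoweringObjectFileELF::Initialize(Ctx, TM);
    InitializeELF(TM.Options.UseInitArray);  // .init_array instead of .ctors
  }
};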
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index a02a368..2d320c5 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -16,9 +16,9 @@
namespace llvm {
- /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
/// x86-64.
- class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
public:
virtual const MCExpr *
getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
@@ -32,6 +32,12 @@ namespace llvm {
MachineModuleInfo *MMI) const;
};
+ /// X86LinuxTargetObjectFile - This implementation is used for Linux x86
+ /// and x86-64.
+ class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+ virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ };
+
} // end namespace llvm
#endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 2fd78a7..e4f567f 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -145,7 +145,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// to insert any VZEROUPPER instructions. This is constant-time, so it is
// cheap in the common case of no ymm use.
bool YMMUsed = false;
- const TargetRegisterClass *RC = X86::VR256RegisterClass;
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
i != e; i++) {
if (MRI.isPhysRegUsed(*i)) {
@@ -205,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
}
- // The entry MBB for the function may set the inital state to dirty if
+ // The entry MBB for the function may set the initial state to dirty if
// the function receives any YMM incoming arguments
if (MBB == MF.begin()) {
EntryState = ST_CLEAN;