aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp23
-rw-r--r--lib/Target/X86/CMakeLists.txt2
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp14
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.c23
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h74
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h106
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp21
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h6
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp16
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h32
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h79
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp13
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp317
-rw-r--r--lib/Target/X86/README-SSE.txt31
-rw-r--r--lib/Target/X86/X86.td22
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp42
-rw-r--r--lib/Target/X86/X86AsmPrinter.h5
-rw-r--r--lib/Target/X86/X86CallingConv.td47
-rw-r--r--lib/Target/X86/X86FastISel.cpp102
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp8
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp21
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp22
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp919
-rw-r--r--lib/Target/X86/X86ISelLowering.h71
-rw-r--r--lib/Target/X86/X86InstrAVX512.td715
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td40
-rw-r--r--lib/Target/X86/X86InstrCompiler.td7
-rw-r--r--lib/Target/X86/X86InstrFPStack.td30
-rw-r--r--lib/Target/X86/X86InstrFormats.td122
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td63
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp174
-rw-r--r--lib/Target/X86/X86InstrInfo.h9
-rw-r--r--lib/Target/X86/X86InstrInfo.td332
-rw-r--r--lib/Target/X86/X86InstrMMX.td15
-rw-r--r--lib/Target/X86/X86InstrSSE.td208
-rw-r--r--lib/Target/X86/X86InstrSVM.td18
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td136
-rw-r--r--lib/Target/X86/X86InstrSystem.td76
-rw-r--r--lib/Target/X86/X86InstrTSX.td7
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp43
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp54
-rw-r--r--lib/Target/X86/X86RegisterInfo.h3
-rw-r--r--lib/Target/X86/X86RegisterInfo.td82
-rw-r--r--lib/Target/X86/X86SchedHaswell.td10
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td9
-rw-r--r--lib/Target/X86/X86Schedule.td13
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td4
-rw-r--r--lib/Target/X86/X86Subtarget.cpp3
-rw-r--r--lib/Target/X86/X86Subtarget.h23
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp2
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp6
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h3
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp97
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp15
54 files changed, 3371 insertions, 964 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 263eb5e..ad83d97 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
@@ -830,6 +831,18 @@ struct X86Operand : public MCParsedAsmOperand {
return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
}
+ bool isMemVZ32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+ bool isMemVZ64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
@@ -890,6 +903,16 @@ struct X86Operand : public MCParsedAsmOperand {
addMemOperands(Inst, N);
}
+ void addMemVZ32Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMemVZ64Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+ void addMem512Operands(MCInst &Inst, unsigned N) const {
+ addMemOperands(Inst, N);
+ }
+
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 7cb71f0..7e20151 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -53,7 +53,7 @@ endif()
add_llvm_target(X86CodeGen ${sources})
-add_dependencies(LLVMX86CodeGen intrinsics_gen)
+add_dependencies(LLVMX86CodeGen X86CommonTableGen intrinsics_gen)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ca71c4f..82af6fa 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -286,6 +286,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_XMM256:
mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
return;
+ case TYPE_XMM512:
+ mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4)));
+ return;
case TYPE_REL8:
isBranch = true;
pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
@@ -443,6 +446,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
EA_BASES_64BIT
REGS_XMM
REGS_YMM
+ REGS_ZMM
#undef ENTRY
}
} else {
@@ -565,6 +569,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM64:
case TYPE_XMM128:
case TYPE_XMM256:
+ case TYPE_XMM512:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
return translateRMRegister(mcInst, insn);
@@ -683,6 +688,15 @@ static bool translateInstruction(MCInst &mcInst,
}
mcInst.setOpcode(insn.instructionID);
+ // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
+ // prefix bytes should be disassembled as xrelease and xacquire then set the
+ // opcode to those instead of the rep and repne opcodes.
+ if (insn.xAcquireRelease) {
+ if(mcInst.getOpcode() == X86::REP_PREFIX)
+ mcInst.setOpcode(X86::XRELEASE_PREFIX);
+ else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+ mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+ }
int index;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index e40edba..bb195ee 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -328,6 +328,27 @@ static int readPrefixes(struct InternalInstruction* insn) {
break;
if (lookAtByte(insn, &nextByte))
return -1;
+ /*
+ * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+ * met:
+ * - it is followed by a LOCK (0xf0) prefix
+ * - it is followed by an xchg instruction
+ * then it should be disassembled as a xacquire/xrelease not repne/rep.
+ */
+ if ((byte == 0xf2 || byte == 0xf3) &&
+ ((nextByte == 0xf0) |
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+ insn->xAcquireRelease = TRUE;
+ /*
+ * Also if the byte is 0xf3, and the following condition is met:
+ * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+ * "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+ * then it should be disassembled as an xrelease not rep.
+ */
+ if (byte == 0xf3 &&
+ (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7))
+ insn->xAcquireRelease = TRUE;
if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
if (consumeByte(insn, &nextByte))
return -1;
@@ -1234,6 +1255,8 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_EAX + index; \
case TYPE_R64: \
return prefix##_RAX + index; \
+ case TYPE_XMM512: \
+ return prefix##_ZMM0 + index; \
case TYPE_XMM256: \
return prefix##_YMM0 + index; \
case TYPE_XMM128: \
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 407ead3..dcb6aad 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -219,7 +219,23 @@ extern "C" {
ENTRY(XMM12) \
ENTRY(XMM13) \
ENTRY(XMM14) \
- ENTRY(XMM15)
+ ENTRY(XMM15) \
+ ENTRY(XMM16) \
+ ENTRY(XMM17) \
+ ENTRY(XMM18) \
+ ENTRY(XMM19) \
+ ENTRY(XMM20) \
+ ENTRY(XMM21) \
+ ENTRY(XMM22) \
+ ENTRY(XMM23) \
+ ENTRY(XMM24) \
+ ENTRY(XMM25) \
+ ENTRY(XMM26) \
+ ENTRY(XMM27) \
+ ENTRY(XMM28) \
+ ENTRY(XMM29) \
+ ENTRY(XMM30) \
+ ENTRY(XMM31)
#define REGS_YMM \
ENTRY(YMM0) \
@@ -237,7 +253,57 @@ extern "C" {
ENTRY(YMM12) \
ENTRY(YMM13) \
ENTRY(YMM14) \
- ENTRY(YMM15)
+ ENTRY(YMM15) \
+ ENTRY(YMM16) \
+ ENTRY(YMM17) \
+ ENTRY(YMM18) \
+ ENTRY(YMM19) \
+ ENTRY(YMM20) \
+ ENTRY(YMM21) \
+ ENTRY(YMM22) \
+ ENTRY(YMM23) \
+ ENTRY(YMM24) \
+ ENTRY(YMM25) \
+ ENTRY(YMM26) \
+ ENTRY(YMM27) \
+ ENTRY(YMM28) \
+ ENTRY(YMM29) \
+ ENTRY(YMM30) \
+ ENTRY(YMM31)
+
+#define REGS_ZMM \
+ ENTRY(ZMM0) \
+ ENTRY(ZMM1) \
+ ENTRY(ZMM2) \
+ ENTRY(ZMM3) \
+ ENTRY(ZMM4) \
+ ENTRY(ZMM5) \
+ ENTRY(ZMM6) \
+ ENTRY(ZMM7) \
+ ENTRY(ZMM8) \
+ ENTRY(ZMM9) \
+ ENTRY(ZMM10) \
+ ENTRY(ZMM11) \
+ ENTRY(ZMM12) \
+ ENTRY(ZMM13) \
+ ENTRY(ZMM14) \
+ ENTRY(ZMM15) \
+ ENTRY(ZMM16) \
+ ENTRY(ZMM17) \
+ ENTRY(ZMM18) \
+ ENTRY(ZMM19) \
+ ENTRY(ZMM20) \
+ ENTRY(ZMM21) \
+ ENTRY(ZMM22) \
+ ENTRY(ZMM23) \
+ ENTRY(ZMM24) \
+ ENTRY(ZMM25) \
+ ENTRY(ZMM26) \
+ ENTRY(ZMM27) \
+ ENTRY(ZMM28) \
+ ENTRY(ZMM29) \
+ ENTRY(ZMM30) \
+ ENTRY(ZMM31)
#define REGS_SEGMENT \
ENTRY(ES) \
@@ -285,6 +351,7 @@ extern "C" {
REGS_MMX \
REGS_XMM \
REGS_YMM \
+ REGS_ZMM \
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
@@ -319,6 +386,7 @@ typedef enum {
ALL_EA_BASES
REGS_XMM
REGS_YMM
+ REGS_ZMM
#undef ENTRY
SIB_INDEX_max
} SIBIndex;
@@ -457,6 +525,8 @@ struct InternalInstruction {
uint64_t necessaryPrefixLocation;
/* The segment override type */
SegmentOverride segmentOverride;
+ /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
+ BOOL xAcquireRelease;
/* Sizes of various critical pieces of data, in bytes */
uint8_t registerSize;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 23dfe4b..d291441 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -116,8 +116,106 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
- ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize")
-
+ ENUM_ENTRY(IC_VEX_L_W, 3, "requires VEX, L and W") \
+ ENUM_ENTRY(IC_VEX_L_W_XS, 4, "requires VEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_XD, 4, "requires VEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 4, "requires VEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
+ ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize")
#define ENUM_ENTRY(n, r, d) n,
typedef enum {
@@ -224,6 +322,7 @@ struct ContextDecision {
ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
+ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
ENUM_ENTRY(ENCODING_CW, "2-byte") \
ENUM_ENTRY(ENCODING_CD, "4-byte") \
@@ -321,6 +420,9 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_XMM64, "8-byte") \
ENUM_ENTRY(TYPE_XMM128, "16-byte") \
ENUM_ENTRY(TYPE_XMM256, "32-byte") \
+ ENUM_ENTRY(TYPE_XMM512, "64-byte") \
+ ENUM_ENTRY(TYPE_VK8, "8-bit") \
+ ENUM_ENTRY(TYPE_VK16, "16-bit") \
ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index e357710..b9d0082 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// Try to print any aliases first.
if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
-
+
// Next always print the annotation.
printAnnotation(OS, Annot);
@@ -139,8 +139,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
int64_t Address;
if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ O << formatHex((uint64_t)Address);
}
else {
// Otherwise, just print the expression.
@@ -159,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << markup("<imm:")
<< '$' << formatImm((int64_t)Op.getImm())
<< markup(">");
-
+
if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
*CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
-
+
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << markup("<imm:")
@@ -177,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
const MCOperand &IndexReg = MI->getOperand(Op+2);
const MCOperand &DispSpec = MI->getOperand(Op+3);
const MCOperand &SegReg = MI->getOperand(Op+4);
-
+
O << markup("<mem:");
// If this has a segment register, print it.
@@ -185,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
printOperand(MI, Op+4, O);
O << ':';
}
-
+
if (DispSpec.isImm()) {
int64_t DispVal = DispSpec.getImm();
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
@@ -194,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
O << *DispSpec.getExpr();
}
-
+
if (IndexReg.getReg() || BaseReg.getReg()) {
O << '(';
if (BaseReg.getReg())
printOperand(MI, Op, O);
-
+
if (IndexReg.getReg()) {
O << ',';
printOperand(MI, Op+2, O);
unsigned ScaleVal = MI->getOperand(Op+1).getImm();
if (ScaleVal != 1) {
O << ','
- << markup("<imm:")
+ << markup("<imm:")
<< ScaleVal // never printed in hex.
- << markup(">");
+ << markup(">");
}
}
O << ')';
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8e09183..8d05256 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -65,6 +65,9 @@ public:
void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
@@ -80,6 +83,9 @@ public:
void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
};
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 141f4a4..9dfc9a9 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -119,7 +119,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
- O << Op.getImm();
+ O << formatImm(Op.getImm());
else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
// If a symbolic branch target was added as a constant expression then print
@@ -127,8 +127,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
int64_t Address;
if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ O << formatHex((uint64_t)Address);
}
else {
// Otherwise, just print the expression.
@@ -137,18 +136,13 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
}
}
-static void PrintRegName(raw_ostream &O, StringRef RegName) {
- for (unsigned i = 0, e = RegName.size(); i != e; ++i)
- O << (char)toupper(RegName[i]);
-}
-
void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- PrintRegName(O, getRegisterName(Op.getReg()));
+ printRegName(O, Op.getReg());
} else if (Op.isImm()) {
- O << Op.getImm();
+ O << formatImm((int64_t)Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << *Op.getExpr();
@@ -200,7 +194,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
DispVal = -DispVal;
}
}
- O << DispVal;
+ O << formatImm(DispVal);
}
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index bb769eb..45beeda 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -41,52 +41,60 @@ public:
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "OPAQUE PTR ";
+ O << "opaque ptr ";
printMemReference(MI, OpNo, O);
}
void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "BYTE PTR ";
+ O << "byte ptr ";
printMemReference(MI, OpNo, O);
}
void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "WORD PTR ";
+ O << "word ptr ";
printMemReference(MI, OpNo, O);
}
void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "DWORD PTR ";
+ O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "QWORD PTR ";
+ O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XMMWORD PTR ";
+ O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "YMMWORD PTR ";
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "DWORD PTR ";
+ O << "dword ptr ";
printMemReference(MI, OpNo, O);
}
void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "QWORD PTR ";
+ O << "qword ptr ";
printMemReference(MI, OpNo, O);
}
void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XWORD PTR ";
+ O << "xword ptr ";
printMemReference(MI, OpNo, O);
}
void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "XMMWORD PTR ";
+ O << "xmmword ptr ";
printMemReference(MI, OpNo, O);
}
void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- O << "YMMWORD PTR ";
+ O << "ymmword ptr ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
};
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index d8f7278..25d1af3 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -462,20 +462,54 @@ namespace X86II {
// prefix. Usually used for scalar instructions. Needed by disassembler.
VEX_LIG = 1U << 6,
+ // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
+ // with following encoding:
+ // - 00 V128
+ // - 01 V256
+ // - 10 V512
+ // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
+ // this will save 1 tsflag bit
+
+ // VEX_EVEX - Specifies that this instruction use EVEX form which provides
+ // syntax support up to 32 512-bit register operands and up to 7 16-bit
+ // mask operands as well as source operand data swizzling/memory operand
+ // conversion, eviction hint, and rounding mode.
+ EVEX = 1U << 7,
+
+ // EVEX_K - Set if this instruction requires masking
+ EVEX_K = 1U << 8,
+
+ // EVEX_Z - Set if this instruction has EVEX.Z field set.
+ EVEX_Z = 1U << 9,
+
+ // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+ EVEX_L2 = 1U << 10,
+
+ // EVEX_B - Set if this instruction has EVEX.B field set.
+ EVEX_B = 1U << 11,
+
+ // EVEX_CD8E - compressed disp8 form, element-size
+ EVEX_CD8EShift = VEXShift + 12,
+ EVEX_CD8EMask = 3,
+
+ // EVEX_CD8V - compressed disp8 form, vector-width
+ EVEX_CD8VShift = EVEX_CD8EShift + 2,
+ EVEX_CD8VMask = 7,
+
/// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
/// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
/// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
/// storing a classifier in the imm8 field. To simplify our implementation,
/// we handle this by storeing the classifier in the opcode field and using
/// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcode = 1U << 7,
+ Has3DNow0F0FOpcode = 1U << 17,
/// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
/// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
- MemOp4 = 1U << 8,
+ MemOp4 = 1U << 18,
/// XOP - Opcode prefix used by XOP instructions.
- XOP = 1U << 9
+ XOP = 1U << 19
};
@@ -533,12 +567,19 @@ namespace X86II {
unsigned CurOp = 0;
if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
- assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
// Special case for GATHER with 2 TIED_TO operands
// Skip the first 2 operands: dst, mask_wb
CurOp += 2;
- }
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ ++CurOp;
return CurOp;
}
@@ -569,12 +610,15 @@ namespace X86II {
case X86II::MRMSrcMem: {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
if (HasMemOp4)
++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
-
+ if (HasEVEX_K)
+ ++FirstMemOp;// Skip the mask register
// FIXME: Maybe lea should have its own form? This is a horrible hack.
//if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
// Opcode == X86::LEA16r || Opcode == X86::LEA32r)
@@ -611,6 +655,14 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
+ (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
+ (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
+ (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+ return true;
+
switch (RegNo) {
default: break;
case X86::R8: case X86::R9: case X86::R10: case X86::R11:
@@ -621,16 +673,21 @@ namespace X86II {
case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
- case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
- case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
- case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
- case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
return true;
}
return false;
}
+
+ /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
+ /// registers? e.g. zmm21, etc.
+ static inline bool is32ExtendedReg(unsigned RegNo) {
+ return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
+ (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
+ (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+ }
+
inline bool isX86_64NonExtLowByteReg(unsigned reg) {
return (reg == X86::SPL || reg == X86::BPL ||
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index de80dd8..b400b87 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -101,7 +101,18 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
- case FK_Data_8: Type = ELF::R_X86_64_64; break;
+ case FK_Data_8:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_X86_64_64;
+ break;
+ case MCSymbolRefExpr::VK_DTPOFF:
+ Type = ELF::R_X86_64_DTPOFF64;
+ break;
+ }
+ break;
case X86::reloc_signed_4byte:
switch (Modifier) {
default:
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 016af71..8515879 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -53,7 +53,7 @@ public:
}
unsigned GetX86RegNum(const MCOperand &MO) const {
- return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
}
// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -77,6 +77,14 @@ public:
return (~SrcRegNum) & 0xf;
}
+ unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
+ "Invalid mask register as write-mask!");
+ unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
+ return MaskRegNum;
+ }
+
void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
OS << (char)C;
++CurByte;
@@ -152,6 +160,52 @@ static bool isDisp8(int Value) {
return Value == (signed char)Value;
}
+/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
+/// compressed dispacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+ assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) &&
+ "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+ unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask;
+ unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask;
+
+ if (CD8V == 0 && CD8E == 0) {
+ CValue = Value;
+ return isDisp8(Value);
+ }
+
+ unsigned MemObjSize = 1U << CD8E;
+ if (CD8V & 4) {
+ // Fixed vector length
+ MemObjSize *= 1U << (CD8V & 0x3);
+ } else {
+ // Modified vector length
+ bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B;
+ if (!EVEX_b) {
+ unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0;
+ EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0;
+ assert(EVEX_LL < 3 && "");
+
+ unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize;
+ NumElems /= 1U << (CD8V & 0x3);
+
+ MemObjSize *= NumElems;
+ }
+ }
+
+ unsigned MemObjMask = MemObjSize - 1;
+ assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size.");
+
+ if (Value & MemObjMask) // Unaligned offset
+ return false;
+ Value /= MemObjSize;
+ bool Ret = (Value == (signed char)Value);
+
+ if (Ret)
+ CValue = Value;
+ return Ret;
+}
+
/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
/// in an instruction with the specified TSFlags.
static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
@@ -318,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
unsigned BaseReg = Base.getReg();
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
// Handle %rip relative addressing.
if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
@@ -378,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
- if (Disp.isImm() && isDisp8(Disp.getImm())) {
- EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
- return;
+ if (Disp.isImm()) {
+ if (!HasEVEX && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+ // 32-bit displacement.
+ int CDisp8 = 0;
+ if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ CDisp8 - Disp.getImm());
+ return;
+ }
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
@@ -397,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
bool ForceDisp32 = false;
bool ForceDisp8 = false;
+ int CDisp8 = 0;
+ int ImmOffset = 0;
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
@@ -412,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
- } else if (isDisp8(Disp.getImm())) {
+ } else if (!HasEVEX && isDisp8(Disp.getImm())) {
// Emit the disp8 encoding.
EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ ImmOffset = CDisp8 - Disp.getImm();
} else {
// Emit the normal disp32 encoding.
EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
@@ -445,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Do we need to output a displacement?
if (ForceDisp8)
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
else if (ForceDisp32 || Disp.getImm() != 0)
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
CurByte, OS, Fixups);
@@ -457,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
raw_ostream &OS) const {
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
@@ -468,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0: Same as REX_R=1 (64 bit mode only)
//
unsigned char VEX_R = 0x1;
+ unsigned char EVEX_R2 = 0x1;
// VEX_X: equivalent to REX.X, only used when a
// register is used for index in SIB Byte.
@@ -504,6 +580,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// VEX_4V (VEX vvvv field): a register specifier
// (in 1's complement form) or 1111 if unused.
unsigned char VEX_4V = 0xf;
+ unsigned char EVEX_V2 = 0x1;
// VEX_L (Vector Length):
//
@@ -511,6 +588,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 1: 256-bit vector
//
unsigned char VEX_L = 0;
+ unsigned char EVEX_L2 = 0;
// VEX_PP: opcode extension providing equivalent
// functionality of a SIMD prefix
@@ -522,6 +600,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
//
unsigned char VEX_PP = 0;
+ // EVEX_U
+ unsigned char EVEX_U = 1; // Always '1' so far
+
+ // EVEX_z
+ unsigned char EVEX_z = 0;
+
+ // EVEX_b
+ unsigned char EVEX_b = 0;
+
+ // EVEX_aaa
+ unsigned char EVEX_aaa = 0;
+
// Encode the operand size opcode prefix as needed.
if (TSFlags & X86II::OpSize)
VEX_PP = 0x01;
@@ -534,6 +624,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
+ if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+ EVEX_L2 = 1;
+
+ if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
+ EVEX_z = 1;
+
+ if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+ EVEX_b = 1;
switch (TSFlags & X86II::Op0Mask) {
default: llvm_unreachable("Invalid prefix!");
@@ -580,12 +678,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned CurOp = 0;
if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
- assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ // Special case for AVX-512 GATHER with 2 TIED_TO operands
+ // Skip the first 2 operands: dst, mask_wb
+ CurOp += 2;
+ else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
// Special case for GATHER with 2 TIED_TO operands
// Skip the first 2 operands: dst, mask_wb
CurOp += 2;
- }
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ // SCATTER
+ ++CurOp;
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
@@ -595,18 +700,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MemAddr, src1(VEX_4V), src2(ModR/M)
// MemAddr, src1(ModR/M), imm8
//
- if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrBaseReg).getReg()))
VEX_B = 0x0;
- if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
+
+ CurOp += X86::AddrNumOperands;
- CurOp = X86::AddrNumOperands;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
const MCOperand &MO = MI.getOperand(CurOp);
- if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
- VEX_R = 0x0;
+ if (MO.isReg()) {
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MO.getReg()))
+ EVEX_R2 = 0x0;
+ }
break;
}
case X86II::MRMSrcMem:
@@ -619,11 +741,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
- if (HasVEX_4V)
+ if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -631,6 +763,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
+ EVEX_V2 = 0x0;
if (HasVEX_4VOp3)
// Instruction format for 4VOp3:
@@ -647,8 +782,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// MRM[0-9]m instructions forms:
// MemAddr
// src1(VEX_4V), MemAddr
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ }
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -669,16 +811,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
CurOp++;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (HasMemOp4) // Skip second register source (encoded in I8IMM)
CurOp++;
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
CurOp++;
if (HasVEX_4VOp3)
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
@@ -690,13 +843,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
CurOp++;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_R2 = 0x0;
break;
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
@@ -704,9 +868,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::MRM6r: case X86II::MRM7r:
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
- VEX_4V = getVEXRegisterEncoding(MI, 0);
- if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+
+ if (HasEVEX_K)
+ EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_X = 0x0;
break;
default: // RawFrm
break;
@@ -715,29 +888,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Emit segment override opcode prefix as needed.
EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
- // VEX opcode prefix can have 2 or 3 bytes
- //
- // 3 bytes:
- // +-----+ +--------------+ +-------------------+
- // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
- // +-----+ +--------------+ +-------------------+
- // 2 bytes:
- // +-----+ +-------------------+
- // | C5h | | R | vvvv | L | pp |
- // +-----+ +-------------------+
- //
- unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+ if (!HasEVEX) {
+ // VEX opcode prefix can have 2 or 3 bytes
+ //
+ // 3 bytes:
+ // +-----+ +--------------+ +-------------------+
+ // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
+ // 2 bytes:
+ // +-----+ +-------------------+
+ // | C5h | | R | vvvv | L | pp |
+ // +-----+ +-------------------+
+ //
+ unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
- if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
- EmitByte(0xC5, CurByte, OS);
- EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
- return;
- }
+ if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ EmitByte(0xC5, CurByte, OS);
+ EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ return;
+ }
- // 3 byte VEX prefix
- EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
- EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ // 3 byte VEX prefix
+ EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ } else {
+ // EVEX opcode prefix can have 4 bytes
+ //
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // +-----+ +--------------+ +-------------------+ +------------------------+
+ assert((VEX_5M & 0x3) == VEX_5M
+ && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ VEX_5M &= 0x3;
+
+ EmitByte(0x62, CurByte, OS);
+ EmitByte((VEX_R << 7) |
+ (VEX_X << 6) |
+ (VEX_B << 5) |
+ (EVEX_R2 << 4) |
+ VEX_5M, CurByte, OS);
+ EmitByte((VEX_W << 7) |
+ (VEX_4V << 3) |
+ (EVEX_U << 2) |
+ VEX_PP, CurByte, OS);
+ EmitByte((EVEX_z << 7) |
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ }
}
/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
@@ -1007,6 +1209,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
const unsigned MemOp4_I8IMMOperand = 2;
+ // It uses the EVEX.aaa field?
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
if (MemoryOperand != -1) MemoryOperand += CurOp;
@@ -1057,6 +1263,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + 1;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1069,6 +1278,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + X86::AddrNumOperands;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1082,6 +1294,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
SrcRegNum = CurOp + 1;
+ if (HasEVEX_K) // Skip writemask
+ SrcRegNum++;
+
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
@@ -1100,6 +1315,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRMSrcMem: {
int AddrOperands = X86::AddrNumOperands;
unsigned FirstMemOp = CurOp+1;
+
+ if (HasEVEX_K) { // Skip writemask
+ ++AddrOperands;
+ ++FirstMemOp;
+ }
+
if (HasVEX_4V) {
++AddrOperands;
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 496b704..adfa7fa 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -517,37 +517,6 @@ to <2 x i64> ops being so bad.
//===---------------------------------------------------------------------===//
-'select' on vectors and scalars could be a whole lot better. We currently
-lower them to conditional branches. On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
- ucomisd %xmm0, %xmm1
- ja LBB1_2 # entry
-LBB1_1: # entry
- movapd %xmm3, %xmm2
-LBB1_2: # entry
- movapd %xmm2, %xmm0
- ret
-
-instead of:
-
-_test:
- cmpltsd %xmm1, %xmm0
- andpd %xmm0, %xmm2
- andnpd %xmm3, %xmm0
- orpd %xmm2, %xmm0
- ret
-
-For unpredictable branches, the later is much more efficient. This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
LLVM currently generates stack realignment code, when it is not necessary
needed. The problem is that we need to know about stack alignment too early,
before RA runs.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index c865500..461ea9b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -86,6 +86,19 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512",
+ "Enable AVX-512 instructions",
+ [FeatureAVX2]>;
+def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true",
+ "Enable AVX-512 Exponential and Reciprocal Instructions",
+ [FeatureAVX512]>;
+def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true",
+ "Enable AVX-512 Conflict Detection Instructions",
+ [FeatureAVX512]>;
+def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true",
+ "Enable AVX-512 PreFetch Instructions",
+ [FeatureAVX512]>;
+
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -227,6 +240,15 @@ def : ProcessorModel<"core-avx2", HaswellModel,
FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
FeatureHLE]>;
+// KNL
+// FIXME: define KNL model
+def : ProcessorModel<"knl", HaswellModel,
+ [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
+ FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+ FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+ FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+ FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
def : Proc<"k6-3", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 6b228b0..9e0ab82 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -702,48 +702,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
}
-MachineLocation
-X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
- MachineLocation Location;
- assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
- // Frame address. Currently handles register +- offset only.
-
- if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm())
- Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
- else {
- DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
- }
- return Location;
-}
-
-void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
- raw_ostream &O) {
- // Only the target-dependent form of DBG_VALUE should get here.
- // Referencing the offset and metadata as NOps-2 and NOps-1 is
- // probably portable to other targets; frame pointer location is not.
- unsigned NOps = MI->getNumOperands();
- assert(NOps==7);
- O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
- // cast away const; DIetc do not take const operands for some reason.
- DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
- if (V.getContext().isSubprogram())
- O << DISubprogram(V.getContext()).getDisplayName() << ":";
- O << V.getName();
- O << " <- ";
- // Frame address. Currently handles register +- offset only.
- O << '[';
- if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg())
- printOperand(MI, 0, O);
- else
- O << "undef";
- O << '+'; printOperand(MI, 3, O);
- O << ']';
- O << "+";
- printOperand(MI, NOps-2, O);
-}
-
-
-
//===----------------------------------------------------------------------===//
// Target Registry Stuff
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index bc7496b..6eed5ce 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -67,11 +67,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
unsigned AsmVariant = 1);
virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
-
- void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
- virtual MachineLocation
- getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 9eafbd5..38e2591 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+ // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+ // can only be used by ABI non-compliant code. This vector type is only
+ // supported while using the AVX-512 target feature.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
// MMX vector types are always returned in MM0. If the target doesn't have
// MM0, it doesn't support these vector types.
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
@@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
// i32, i64 in the standard way
CCDelegateTo<RetCC_X86Common>
]>;
@@ -156,6 +166,11 @@ def RetCC_X86_32 : CallingConv<[
def RetCC_X86_64 : CallingConv<[
// HiPE uses RetCC_X86_64_HiPE
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+ // Handle explicit CC selection
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -208,10 +223,15 @@ def CC_X86_64_C : CallingConv<[
// fixed arguments to vararg functions are supposed to be passed in
// registers. Actually modeling that would be a lot of work, though.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasAVX()",
+ CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
YMM4, YMM5, YMM6, YMM7]>>>>,
+ // The first 8 512-bit vector arguments are passed in ZMM registers.
+ CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
@@ -225,7 +245,11 @@ def CC_X86_64_C : CallingConv<[
// 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCAssignToStack<32, 32>>
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64],
+ CCAssignToStack<64, 64>>
]>;
// Calling convention used on Win64
@@ -246,6 +270,9 @@ def CC_X86_Win64_C : CallingConv<[
// 256 bit vectors are passed by pointer
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+ // 512 bit vectors are passed by pointer
+ CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
@@ -340,7 +367,7 @@ def CC_X86_32_Common : CallingConv<[
// The first 4 AVX 256-bit vector arguments are passed in YMM registers.
CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCIfSubtarget<"hasAVX()",
+ CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
@@ -464,6 +491,10 @@ def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[v8f32, v4f64, v8i32, v4i64],
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64, v16i32, v8i64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
CCDelegateTo<CC_X86_32_C>
@@ -489,6 +520,8 @@ def CC_X86_32 : CallingConv<[
def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -528,6 +561,10 @@ def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
R13, R14, R15,
(sequence "YMM%u", 6, 15))>;
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+ R12, R13, R14, R15,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
//Standard C + XMM 8-15
def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
(sequence "XMM%u", 8, 15))>;
@@ -535,3 +572,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
//Standard C + YMM 8-15
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 295a577..5bc3420 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -79,8 +79,10 @@ private:
bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
- bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
- bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
+ bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
+ bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM,
+ bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
@@ -233,7 +235,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
+ const X86AddressMode &AM, bool Aligned) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
@@ -243,8 +246,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
- Val = AndResult;
+ TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
+ ValReg = AndResult;
}
// FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
@@ -260,26 +263,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
(Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
break;
case MVT::v4f32:
- Opc = X86::MOVAPSmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
- Opc = X86::MOVAPDmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- Opc = X86::MOVDQAmr;
+ if (Aligned)
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ else
+ Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
}
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc)), AM).addReg(Val);
+ DL, TII.get(Opc)), AM).addReg(ValReg);
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
- const X86AddressMode &AM) {
+ const X86AddressMode &AM, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
@@ -314,7 +326,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
if (ValReg == 0)
return false;
- return X86FastEmitStore(VT, ValReg, AM);
+ return X86FastEmitStore(VT, ValReg, AM, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -688,6 +700,10 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (S->isAtomic())
return false;
+ unsigned SABIAlignment =
+ TD.getABITypeAlignment(S->getValueOperand()->getType());
+ bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
+
MVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
return false;
@@ -696,7 +712,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (!X86SelectAddress(I->getOperand(1), AM))
return false;
- return X86FastEmitStore(VT, I->getOperand(0), AM);
+ return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
@@ -712,10 +728,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
- CC != CallingConv::X86_FastCall)
+ CC != CallingConv::X86_FastCall &&
+ CC != CallingConv::X86_64_SysV)
return false;
- if (Subtarget->isTargetWin64())
+ if (Subtarget->isCallingConvWin64(CC))
return false;
// Don't handle popping bytes on return for now.
@@ -1376,10 +1393,37 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Generate the DIV/IDIV instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
- // Copy output register into result register.
- unsigned ResultReg = createResultReg(TypeEntry.RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg);
+ // For i8 remainder, we can't reference AH directly, as we'll end
+ // up with bogus copies like %R9B = COPY %AH. Reference AX
+ // instead to prevent AH references in a REX instruction.
+ //
+ // The current assumption of the fast register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
+ unsigned ResultReg = 0;
+ if ((I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem) &&
+ OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+ unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+ // Shift AX right by 8 bits instead of using AH.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri),
+ ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+ // Now reference the 8-bit subreg of the result.
+ ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+ /*Kill=*/true, X86::sub_8bit);
+ }
+ // Copy the result out of the physreg if we haven't already.
+ if (!ResultReg) {
+ ResultReg = createResultReg(TypeEntry.RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg)
+ .addReg(OpEntry.DivRemResultReg);
+ }
UpdateValueMap(I, ResultReg);
return true;
@@ -1678,9 +1722,6 @@ bool X86FastISel::FastLowerArguments() {
if (!FuncInfo.CanLowerReturn)
return false;
- if (Subtarget->isTargetWin64())
- return false;
-
const Function *F = FuncInfo.Fn;
if (F->isVarArg())
return false;
@@ -1688,7 +1729,10 @@ bool X86FastISel::FastLowerArguments() {
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::C)
return false;
-
+
+ if (Subtarget->isCallingConvWin64(CC))
+ return false;
+
if (!Subtarget->is64Bit())
return false;
@@ -1732,8 +1776,6 @@ bool X86FastISel::FastLowerArguments() {
const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
I != E; ++I, ++Idx) {
- if (I->use_empty())
- continue;
bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
@@ -1792,8 +1834,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Handle only C and fastcc calling conventions for now.
ImmutableCallSite CS(CI);
CallingConv::ID CC = CS.getCallingConv();
+ bool isWin64 = Subtarget->isCallingConvWin64(CC);
if (CC != CallingConv::C && CC != CallingConv::Fast &&
- CC != CallingConv::X86_FastCall)
+ CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 &&
+ CC != CallingConv::X86_64_SysV)
return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
@@ -1807,7 +1851,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Don't know how to handle Win64 varargs yet. Nothing special needed for
// x86-32. Special handling for x86-64 is implemented.
- if (isVarArg && Subtarget->isTargetWin64())
+ if (isVarArg && isWin64)
return false;
// Fast-isel doesn't know about callee-pop yet.
@@ -1937,7 +1981,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
I->getParent()->getContext());
// Allocate shadow area for Win64
- if (Subtarget->isTargetWin64())
+ if (isWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
@@ -2053,7 +2097,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
X86::EBX).addReg(Base);
}
- if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+ if (Subtarget->is64Bit() && isVarArg && !isWin64) {
// Count the number of XMM registers allocated.
static const uint16_t XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -2122,7 +2166,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (Subtarget->isPICStyleGOT())
MIB.addReg(X86::EBX, RegState::Implicit);
- if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+ if (Subtarget->is64Bit() && isVarArg && !isWin64)
MIB.addReg(X86::AL, RegState::Implicit);
// Add implicit physical register uses to the call.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 8522c8c..48470da 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -115,9 +115,10 @@ namespace {
unsigned Mask = 0;
for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
E = MBB->livein_end(); I != E; ++I) {
- unsigned Reg = *I - X86::FP0;
- if (Reg < 8)
- Mask |= 1 << Reg;
+ unsigned Reg = *I;
+ if (Reg < X86::FP0 || Reg > X86::FP6)
+ continue;
+ Mask |= 1 << (Reg - X86::FP0);
}
return Mask;
}
@@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
.addExternalSymbol("_ftol2")
.addReg(X86::ST0, RegState::ImplicitKill)
+ .addReg(X86::ECX, RegState::ImplicitDefine)
.addReg(X86::EAX, RegState::Define | RegState::Implicit)
.addReg(X86::EDX, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 3061117..b994e67 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -307,7 +307,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
unsigned FramePtr) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
- const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -360,7 +360,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
if (HasFP && FramePtr == Reg)
continue;
- unsigned DwarfReg = MRI.getDwarfRegNum(Reg, true);
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset));
}
}
@@ -914,11 +914,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- // MSVC x64's __chkstk needs to adjust %rsp.
- // FIXME: %rax preserves the offset and should be available.
- if (isSPUpdateNeeded)
- emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
- UseLEA, TII, *RegInfo);
+ // MSVC x64's __chkstk does not adjust %rsp itself.
+ // It also does not clobber %rax so we can reuse it when adjusting %rsp.
+ if (isSPUpdateNeeded) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
+ .addReg(StackPtr)
+ .addReg(X86::RAX)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
if (isEAXAlive) {
// Restore EAX
@@ -1320,7 +1323,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
unsigned SlotSize = RegInfo->getSlotSize();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0) {
// create RETURNADDR area
@@ -1333,7 +1336,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// }
// [EBP]
MFI->CreateFixedObject(-TailCallReturnAddrDelta,
- (-1U*SlotSize)+TailCallReturnAddrDelta, true);
+ TailCallReturnAddrDelta - SlotSize, true);
}
if (hasFP(MF)) {
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4ffffa1..9465420 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -141,10 +141,6 @@ namespace {
/// SelectionDAG operations.
///
class X86DAGToDAGISel : public SelectionDAGISel {
- /// X86Lowering - This object fully describes how to lower LLVM code to an
- /// X86-specific SelectionDAG.
- const X86TargetLowering &X86Lowering;
-
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
@@ -156,7 +152,6 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel),
- X86Lowering(*tm.getTargetLowering()),
Subtarget(&tm.getSubtarget<X86Subtarget>()),
OptForSize(false) {}
@@ -233,7 +228,8 @@ namespace {
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
- CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI->getPointerTy()) :
+ CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
+ getTargetLowering()->getPointerTy()) :
AM.Base_Reg;
Scale = getI8Imm(AM.Scale);
Index = AM.IndexReg;
@@ -504,8 +500,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
- bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
- bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(getTargetLowering());
+ bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
@@ -1556,7 +1554,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg,
+ getTargetLowering()->getPointerTy()).getNode();
}
SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
@@ -2532,6 +2531,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Prevent use of AH in a REX instruction by referencing AX instead.
// Shift it down 8 bits.
+ //
+ // The current assumption of the register allocator is that isel
+ // won't generate explicit references to the GPR8_NOREX registers. If
+ // the allocator and/or the backend get enhanced to be more robust in
+ // that regard, this can be, and should be, removed.
if (HiReg == X86::AH && Subtarget->is64Bit() &&
!SDValue(Node, 1).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 346dfbb..9ffe29f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -58,17 +58,14 @@ STATISTIC(NumTailCalls, "Number of tail calls");
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
-/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
-/// sets things up to match to an AVX VEXTRACTF128 instruction or a
-/// simple subregister reference. Idx is an index in the 128 bits we
-/// want. It need not be aligned to a 128-bit bounday. That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
EVT VT = Vec.getValueType();
- assert(VT.is256BitVector() && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/128;
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
@@ -76,13 +73,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
if (Vec.getOpcode() == ISD::UNDEF)
return DAG.getUNDEF(ResultVT);
- // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
- // we can match to VEXTRACTF128.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
+ // This is the index of the first element of the vectorWidth-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
* ElemsPerChunk);
// If the input is a buildvector just emit a smaller one.
@@ -95,38 +91,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
VecIdx);
return Result;
+
+}
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}
-/// Generate a DAG to put 128-bits into a vector > 128 bits. This
-/// sets things up to match to an AVX VINSERTF128 instruction or a
-/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit bounday. That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl) {
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
// Inserting UNDEF is Result
if (Vec.getOpcode() == ISD::UNDEF)
return Result;
-
EVT VT = Vec.getValueType();
- assert(VT.is128BitVector() && "Unexpected vector size!");
-
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
- // Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
+ // This is the index of the first element of the vectorWidth-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
* ElemsPerChunk);
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
}
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit bounday. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
@@ -139,6 +168,13 @@ static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
@@ -563,10 +599,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
- setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
- setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
- setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
- setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
if (Subtarget->is64Bit()) {
setExceptionPointerRegister(X86::RAX);
setExceptionSelectorRegister(X86::RDX);
@@ -586,10 +618,12 @@ void X86TargetLowering::resetOperationActions() {
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit()) {
+ if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
+ // TargetInfo::X86_64ABIBuiltinVaList
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
} else {
+ // TargetInfo::CharPtrBuiltinVaList
setOperationAction(ISD::VAARG , MVT::Other, Expand);
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
}
@@ -1000,7 +1034,7 @@ void X86TargetLowering::resetOperationActions() {
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
}
- if (Subtarget->hasSSE41()) {
+ if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -1263,6 +1297,150 @@ void X86TargetLowering::resetOperationActions() {
}
}
+ if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
+ addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v8f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+ setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
+
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
+
+ setOperationAction(ISD::ADD, MVT::v8i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v8i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::AND, MVT::v8i64, Legal);
+ setOperationAction(ISD::OR, MVT::v8i64, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+
+ // Custom lower several nodes.
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ // Extract subvector is special because the value type
+ // (result) is 256/128-bit but the source is 512-bit wide.
+ if (VT.is128BitVector() || VT.is256BitVector())
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ if (VT.getVectorElementType() == MVT::i1)
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+ // Do not attempt to custom lower other non-512-bit vectors
+ if (!VT.is512BitVector())
+ continue;
+
+ if (VT != MVT::v8i64) {
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+ }
+ if ( EltSize >= 32) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ }
+ }
+ for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to promote non-256-bit vectors
+ if (!VT.is512BitVector())
+ continue;
+
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+ }
+ }// has AVX-512
+
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1884,13 +2062,19 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
CC == CallingConv::HiPE);
}
+/// \brief Return true if the calling convention is a C calling convention.
+static bool IsCCallConvention(CallingConv::ID CC) {
+ return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
+ CC == CallingConv::X86_64_SysV);
+}
+
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
return false;
CallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
- if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+ if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
return false;
return true;
@@ -1965,7 +2149,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
bool IsWindows = Subtarget->isTargetWindows();
- bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
@@ -1976,9 +2160,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (IsWin64) {
+ if (IsWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
@@ -2004,12 +2187,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = &X86::FR64RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
else
llvm_unreachable("Unknown argument type!");
@@ -2267,7 +2456,8 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
- MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+ false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(NewReturnAddrFI),
@@ -2279,10 +2469,10 @@ SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- SDLoc &dl = CLI.DL;
- SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
- SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
- SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDLoc &dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
@@ -2291,7 +2481,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
bool IsWindows = Subtarget->isTargetWindows();
StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
@@ -2324,9 +2514,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (IsWin64) {
+ if (IsWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
@@ -2837,13 +3026,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG &DAG) const {
- if (!IsTailCallConvention(CalleeCC) &&
- CalleeCC != CallingConv::C)
+ if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
const MachineFunction &MF = DAG.getMachineFunction();
- const Function *CallerF = DAG.getMachineFunction().getFunction();
+ const Function *CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
@@ -2853,6 +3041,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+ bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+ bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
@@ -2886,7 +3076,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
- if (Subtarget->isTargetWin64())
+ if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2961,9 +3151,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
- if (Subtarget->isTargetWin64()) {
+ if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
- }
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
if (CCInfo.getNextStackOffset()) {
@@ -3135,7 +3324,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
- ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ -(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -3696,12 +3886,10 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
- i != (l+1)*NumLaneElts;
- i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (V2IsSplat) {
@@ -3735,11 +3923,10 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
- i != (l+1)*NumLaneElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (V2IsSplat) {
@@ -3780,12 +3967,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
- i != (l+1)*NumLaneElts;
- i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
@@ -3815,11 +4000,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
- i != (l+1)*NumLaneElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l+i];
+ int BitI1 = Mask[l+i+1];
if (!isUndefOrEqual(BitI, j))
return false;
if (!isUndefOrEqual(BitI1, j))
@@ -4051,42 +4235,59 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
return true;
}
-/// isVEXTRACTF128Index - Return true if the specified
+/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
-/// suitable for input to VEXTRACTF128.
-bool X86::isVEXTRACTF128Index(SDNode *N) {
+/// suitable for instruction that extract 128 or 256 bit vectors
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
return false;
- // The index should be aligned on a 128-bit boundary.
+ // The index should be aligned on a vecWidth-bit boundary.
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
MVT VT = N->getValueType(0).getSimpleVT();
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
- bool Result = (Index * ElSize) % 128 == 0;
+ bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
}
-/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
+/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
-/// VINSERTF128.
-bool X86::isVINSERTF128Index(SDNode *N) {
+/// insertion of 128 or 256-bit subvectors
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
return false;
-
- // The index should be aligned on a 128-bit boundary.
+ // The index should be aligned on a vecWidth-bit boundary.
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
MVT VT = N->getValueType(0).getSimpleVT();
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
- bool Result = (Index * ElSize) % 128 == 0;
+ bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
}
+bool X86::isVINSERT128Index(SDNode *N) {
+ return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+ return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+ return isVEXTRACTIndex(N, 256);
+}
+
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
@@ -4190,12 +4391,10 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
return (Val - i) * EltSize;
}
-/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
-/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
-/// instructions.
-unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
- llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+ llvm_unreachable("Illegal extract subvector for VEXTRACT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
@@ -4203,16 +4402,14 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
MVT ElVT = VecVT.getVectorElementType();
- unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
}
-/// getInsertVINSERTF128Immediate - Return the appropriate immediate
-/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
-/// instructions.
-unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+ assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
- llvm_unreachable("Illegal insert subvector for VINSERTF128");
+ llvm_unreachable("Illegal insert subvector for VINSERT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
@@ -4220,10 +4417,38 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
MVT VecVT = N->getValueType(0).getSimpleVT();
MVT ElVT = VecVT.getVectorElementType();
- unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
}
+/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 128);
+}
+
+/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+ return getExtractVEXTRACTImmediate(N, 256);
+}
+
+/// getInsertVINSERT128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 128);
+}
+
+/// getInsertVINSERT256Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+ return getInsertVINSERTImmediate(N, 256);
+}
+
/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
/// Handles 256-bit.
@@ -5063,10 +5288,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- Mask.push_back(EltNo);
-
+ SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -5184,7 +5406,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getValueType().getSimpleVT();
SDLoc dl(Op);
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
SDValue Ld;
@@ -5240,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
// The scalar_to_vector node and the suspected
// load node must have exactly one user.
// Constants may have multiple users.
- if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+
+ // AVX-512 has register version of the broadcast
+ bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+ Ld.getValueType().getSizeInBits() >= 32;
+ if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+ !hasRegVer))
return SDValue();
break;
}
}
- bool Is256 = VT.is256BitVector();
+ bool IsGE256 = (VT.getSizeInBits() >= 256);
// Handle the broadcasting a single constant scalar from the constant pool
// into a vector. On Sandybridge it is still better to load a constant vector
@@ -5256,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
assert(!CVT.isVector() && "Must not broadcast a vector type");
unsigned ScalarSize = CVT.getSizeInBits();
- if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
const Constant *C = 0;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -5280,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget->hasInt256() &&
- (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+ (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
- if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -5375,6 +5602,108 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
return NV;
}
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+ "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+ SDLoc dl(Op);
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ Ops, VT.getVectorNumElements());
+ }
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ Ops, VT.getVectorNumElements());
+ }
+
+ bool AllContants = true;
+ uint64_t Immediate = 0;
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.getOpcode() == ISD::UNDEF)
+ continue;
+ if (!isa<ConstantSDNode>(In)) {
+ AllContants = false;
+ break;
+ }
+ if (cast<ConstantSDNode>(In)->getZExtValue())
+ Immediate |= (1ULL << idx);
+ }
+
+ if (AllContants) {
+ SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
+ DAG.getConstant(Immediate, MVT::i16));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
+ DAG.getIntPtrConstant(0));
+ }
+
+ if (!isSplatVector(Op.getNode()))
+ llvm_unreachable("Unsupported predicate operation");
+
+ SDValue In = Op.getOperand(0);
+ SDValue EFLAGS, X86CC;
+ if (In.getOpcode() == ISD::SETCC) {
+ SDValue Op0 = In.getOperand(0);
+ SDValue Op1 = In.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
+ bool isFP = Op1.getValueType().isFloatingPoint();
+ unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+
+ assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
+
+ X86CC = DAG.getConstant(X86CCVal, MVT::i8);
+ EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ } else if (In.getOpcode() == X86ISD::SETCC) {
+ X86CC = In.getOperand(0);
+ EFLAGS = In.getOperand(1);
+ } else {
+ // The algorithm:
+ // Bit1 = In & 0x1
+ // if (Bit1 != 0)
+ // ZF = 0
+ // else
+ // ZF = 1
+ // if (ZF == 0)
+ // res = allOnes ### CMOVNE -1, %res
+ // else
+ // res = allZero
+ MVT InVT = In.getValueType().getSimpleVT();
+ SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
+ EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
+ X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ }
+
+ if (VT == MVT::v16i1) {
+ SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
+ SDValue Cst0 = DAG.getConstant(0, MVT::i16);
+ SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
+ Cst0, Cst1, X86CC, EFLAGS);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+ }
+
+ if (VT == MVT::v8i1) {
+ SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
+ SDValue Cst0 = DAG.getConstant(0, MVT::i32);
+ SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
+ Cst0, Cst1, X86CC, EFLAGS);
+ CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+ }
+ llvm_unreachable("Unsupported predicate operation");
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -5383,6 +5712,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
+ // Generate vectors for predicate vectors.
+ if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
+ return LowerBUILD_VECTORvXi1(Op, DAG);
+
// Vectors containing all zeros can be matched by pxor and xorps later
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
@@ -5713,19 +6046,22 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getValueType().getSimpleVT();
- assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+ assert((ResVT.is256BitVector() ||
+ ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
+ if(ResVT.is256BitVector())
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
- return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 2);
- // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
@@ -7195,6 +7531,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc dl(Op);
if (!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
@@ -7203,17 +7540,19 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
- if (VecVT.is256BitVector()) {
- SDLoc dl(Op.getNode());
- unsigned NumElems = VecVT.getVectorNumElements();
+ if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ EVT EltVT = VecVT.getVectorElementType();
+
+ unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
- if (IdxVal >= NumElems/2)
- IdxVal -= NumElems/2;
+ //if (IdxVal >= NumElems/2)
+ // IdxVal -= NumElems/2;
+ IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getConstant(IdxVal, MVT::i32));
}
@@ -7227,7 +7566,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
MVT VT = Op.getValueType().getSimpleVT();
- SDLoc dl(Op);
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
@@ -7348,19 +7686,20 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// If this is a 256-bit vector result, first extract the 128-bit vector,
// insert the element into the extracted half and then place it back.
- if (VT.is256BitVector()) {
+ if (VT.is256BitVector() || VT.is512BitVector()) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
// Get the desired 128-bit vector half.
- unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired half.
- bool Upper = IdxVal >= NumElems/2;
+ unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
+ unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
+ DAG.getConstant(IdxIn128, MVT::i32));
// Insert the changed part back to the 256-bit vector
return Insert128BitVector(N0, V, IdxVal, DAG, dl);
@@ -7393,9 +7732,10 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
+ unsigned SizeFactor = OpVT.getSizeInBits()/128;
EVT VT128 = EVT::getVectorVT(*Context,
OpVT.getVectorElementType(),
- OpVT.getVectorNumElements() / 2);
+ OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
@@ -7418,16 +7758,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue Idx = Op.getNode()->getOperand(1);
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ EVT ResVT = Op.getValueType();
+ EVT InVT = In.getValueType();
- if (Op.getNode()->getValueType(0).is128BitVector() &&
- Vec.getNode()->getValueType(0).is256BitVector() &&
+ if (Subtarget->hasFp256()) {
+ if (ResVT.is128BitVector() &&
+ (InVT.is256BitVector() || InVT.is512BitVector()) &&
isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Extract128BitVector(Vec, IdxVal, DAG, dl);
+ return Extract128BitVector(In, IdxVal, DAG, dl);
+ }
+ if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ return Extract256BitVector(In, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7444,12 +7790,20 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if (Op.getNode()->getValueType(0).is256BitVector() &&
+ if ((Op.getNode()->getValueType(0).is256BitVector() ||
+ Op.getNode()->getValueType(0).is512BitVector()) &&
SubVec.getNode()->getValueType(0).is128BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
+
+ if (Op.getNode()->getValueType(0).is512BitVector() &&
+ SubVec.getNode()->getValueType(0).is256BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ }
}
return SDValue();
}
@@ -8100,7 +8454,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
- const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
@@ -8837,7 +9191,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
Opnds.push_back(N->getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
- SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all OR'd operands.
if (I->getOpcode() == ISD::OR) {
Opnds.push_back(I->getOperand(0));
@@ -9236,6 +9590,51 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
+/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
+/// mask CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+ SDValue &Op1) {
+ unsigned SSECC;
+ bool Swap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true; // Fallthrough
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true; // Fallthrough
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ return SSECC;
+}
+
// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
@@ -9283,43 +9682,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
- unsigned SSECC;
- bool Swap = false;
-
- // SSE Condition code mapping:
- // 0 - EQ
- // 1 - LT
- // 2 - LE
- // 3 - UNORD
- // 4 - NEQ
- // 5 - NLT
- // 6 - NLE
- // 7 - ORD
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETOEQ:
- case ISD::SETEQ: SSECC = 0; break;
- case ISD::SETOGT:
- case ISD::SETGT: Swap = true; // Fallthrough
- case ISD::SETLT:
- case ISD::SETOLT: SSECC = 1; break;
- case ISD::SETOGE:
- case ISD::SETGE: Swap = true; // Fallthrough
- case ISD::SETLE:
- case ISD::SETOLE: SSECC = 2; break;
- case ISD::SETUO: SSECC = 3; break;
- case ISD::SETUNE:
- case ISD::SETNE: SSECC = 4; break;
- case ISD::SETULE: Swap = true; // Fallthrough
- case ISD::SETUGE: SSECC = 5; break;
- case ISD::SETULT: Swap = true; // Fallthrough
- case ISD::SETUGT: SSECC = 6; break;
- case ISD::SETO: SSECC = 7; break;
- case ISD::SETUEQ:
- case ISD::SETONE: SSECC = 8; break;
- }
- if (Swap)
- std::swap(Op0, Op1);
+ unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
// In the two special cases we can't handle, emit two comparisons.
if (SSECC == 8) {
@@ -9351,8 +9714,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc;
- bool Swap = false, Invert = false, FlipSigns = false;
-
+ bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
@@ -9366,6 +9729,23 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
case ISD::SETUGE: Swap = true;
case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
}
+
+ // Special case: Use min/max operations for SETULE/SETUGE
+ MVT VET = VT.getVectorElementType();
+ bool hasMinMax =
+ (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+ || (Subtarget->hasSSE2() && (VET == MVT::i8));
+
+ if (hasMinMax) {
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+ }
+
+ if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ }
+
if (Swap)
std::swap(Op0, Op1);
@@ -9399,8 +9779,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
- const int MaskHi[] = { 1, 1, 3, 3 };
- const int MaskLo[] = { 0, 0, 2, 2 };
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
@@ -9427,7 +9807,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
- const int Mask[] = { 1, 0, 3, 2 };
+ static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
@@ -9452,6 +9832,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
+
+ if (MinMax)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
return Result;
}
@@ -9560,8 +9943,30 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
+ EVT VT = Op1.getValueType();
SDValue CC;
+ // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available. Otherwise fp cmovs get lowered into a less efficient branch
+ // sequence later on.
+ if (Cond.getOpcode() == ISD::SETCC &&
+ ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+ (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+ VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
+ SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+ int SSECC = translateX86FSETCC(
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+ if (SSECC != 8) {
+ unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
+ SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, MVT::i8));
+ SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+ SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+ return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+ }
+ }
+
if (Cond.getOpcode() == ISD::SETCC) {
SDValue NewCond = LowerSETCC(Cond, DAG);
if (NewCond.getNode())
@@ -10889,8 +11294,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
X86CC = X86::COND_E;
break;
}
- SmallVector<SDValue, 5> NewOps;
- NewOps.append(Op->op_begin()+1, Op->op_end());
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -10907,8 +11311,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
else
Opcode = X86ISD::PCMPESTRI;
- SmallVector<SDValue, 5> NewOps;
- NewOps.append(Op->op_begin()+1, Op->op_end());
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
}
@@ -11492,7 +11895,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
- const int UnpackMask[] = { 1, -1, 3, -1 };
+ static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
@@ -11506,7 +11909,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
- const int ShufMask[] = { 0, 4, 2, 6 };
+ static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
@@ -11560,9 +11963,11 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
APInt SplatValue, SplatUndef;
- unsigned MinSplatBits;
+ unsigned SplatBitSize;
bool HasAnyUndefs;
- if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+ if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs) ||
+ EltTy.getSizeInBits() < SplatBitSize)
return SDValue();
if ((SplatValue != 0) &&
@@ -12706,6 +13111,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::SHRD: return "X86ISD::SHRD";
case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FSRL: return "X86ISD::FSRL";
@@ -12829,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
@@ -12917,6 +13324,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return NumBits1 > NumBits2;
}
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+
+ if (!isTypeLegal(EVT::getEVT(Ty1)))
+ return false;
+
+ assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+ // Assuming the caller doesn't have a zeroext or signext return parameter,
+ // truncation all the way down to i1 is valid.
+ return true;
+}
+
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
@@ -12968,6 +13389,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+ return false;
+
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
@@ -14434,12 +14876,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
} else {
// __chkstk(MSVCRT): does not update stack pointer.
// Clobbers R10, R11 and EFLAGS.
- // FIXME: RAX(allocated size) might be reused and not killed.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol("__chkstk")
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to subtracted from RSP.
+ // RAX has the offset to be subtracted from RSP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
@@ -16299,6 +16740,38 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// \brief Returns a vector of 0s if the node in input is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ (!Subtarget->hasInt256() ||
+ (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+ return SDValue();
+
+ SDValue Amt = N->getOperand(1);
+ SDLoc DL(N);
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ APInt ShiftAmt = C->getAPIntValue();
+ unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+
+ // SSE2/AVX2 logical shifts always return a vector of 0s
+ // if the shift amount is bigger than or equal to
+ // the element size. The constant shift amount will be
+ // encoded as a 8-bit immediate.
+ if (ShiftAmt.trunc(8).uge(MaxAmount))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+ }
+ }
+
+ return SDValue();
+}
+
/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -16308,6 +16781,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
if (V.getNode()) return V;
}
+ if (N->getOpcode() != ISD::SRA) {
+ // Try to fold this logical shift into a zero vector.
+ SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
+ if (V.getNode()) return V;
+ }
+
return SDValue();
}
@@ -17253,7 +17732,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- EVT VT = LHS.getValueType();
+ MVT VT = LHS.getValueType().getSimpleVT();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
@@ -17324,23 +17803,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// LHS = VECTOR_SHUFFLE A, B, LMask
// RHS = VECTOR_SHUFFLE A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
- for (unsigned i = 0; i != NumElts; ++i) {
- int LIdx = LMask[i], RIdx = RMask[i];
-
- // Ignore any UNDEF components.
- if (LIdx < 0 || RIdx < 0 ||
- (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
- (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
- continue;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+ // Ignore any UNDEF components.
+ if (LIdx < 0 || RIdx < 0 ||
+ (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ continue;
- // Check that successive elements are being operated on. If not, this is
- // not a horizontal operation.
- unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
- unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
- int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
- if (!(LIdx == Index && RIdx == Index + 1) &&
- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
- return false;
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+ int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+ if (!(LIdx == Index && RIdx == Index + 1) &&
+ !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+ return false;
+ }
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
@@ -17428,6 +17908,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+ // FANDN(x, 0.0) -> 0.0
+ // FANDN(0.0, x) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
static SDValue PerformBTCombine(SDNode *N,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
@@ -17882,6 +18375,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMIN:
case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
@@ -18423,7 +18917,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const {
+ MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
@@ -18490,7 +18984,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget->hasSSE1()) break;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
@@ -18515,6 +19009,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v8f32:
case MVT::v4f64:
return std::make_pair(0U, &X86::VR256RegClass);
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ return std::make_pair(0U, &X86::VR512RegClass);
}
break;
}
@@ -18625,7 +19124,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
} else if (Res.second == &X86::FR32RegClass ||
Res.second == &X86::FR64RegClass ||
- Res.second == &X86::VR128RegClass) {
+ Res.second == &X86::VR128RegClass ||
+ Res.second == &X86::VR256RegClass ||
+ Res.second == &X86::FR32XRegClass ||
+ Res.second == &X86::FR64XRegClass ||
+ Res.second == &X86::VR128XRegClass ||
+ Res.second == &X86::VR256XRegClass ||
+ Res.second == &X86::VR512RegClass) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
@@ -18639,6 +19144,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.second = &X86::VR128RegClass;
else if (X86::VR256RegClass.hasType(VT))
Res.second = &X86::VR256RegClass;
+ else if (X86::VR512RegClass.hasType(VT))
+ Res.second = &X86::VR512RegClass;
}
return Res;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index c0e1015..2703274 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -53,6 +53,10 @@ namespace llvm {
/// to X86::XORPS or X86::XORPD.
FXOR,
+ /// FAND - Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
/// FSRL - Bitwise logical right shift of floating point values. These
/// corresponds to X86::PSRLDQ.
FSRL,
@@ -290,6 +294,10 @@ namespace llvm {
// TESTP - Vector packed fp sign bitwise comparisons
TESTP,
+ // OR/AND test for masks
+ KORTEST,
+ KTEST,
+
// Several flavors of instructions with vector shuffle behaviors.
PALIGNR,
PSHUFD,
@@ -313,6 +321,8 @@ namespace llvm {
VPERMI,
VPERM2X128,
VBROADCAST,
+ // masked broadcast
+ VBROADCASTM,
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
@@ -434,25 +444,45 @@ namespace llvm {
/// Define some predicates that are used for node matching.
namespace X86 {
- /// isVEXTRACTF128Index - Return true if the specified
+ /// isVEXTRACT128Index - Return true if the specified
+ /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+ /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+ bool isVEXTRACT128Index(SDNode *N);
+
+ /// isVINSERT128Index - Return true if the specified
+ /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+ /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+ bool isVINSERT128Index(SDNode *N);
+
+ /// isVEXTRACT256Index - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
- /// suitable for input to VEXTRACTF128.
- bool isVEXTRACTF128Index(SDNode *N);
+ /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+ bool isVEXTRACT256Index(SDNode *N);
- /// isVINSERTF128Index - Return true if the specified
+ /// isVINSERT256Index - Return true if the specified
/// INSERT_SUBVECTOR operand specifies a subvector insert that is
- /// suitable for input to VINSERTF128.
- bool isVINSERTF128Index(SDNode *N);
+ /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+ bool isVINSERT256Index(SDNode *N);
- /// getExtractVEXTRACTF128Immediate - Return the appropriate
+ /// getExtractVEXTRACT128Immediate - Return the appropriate
/// immediate to extract the specified EXTRACT_SUBVECTOR index
- /// with VEXTRACTF128 instructions.
- unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+ /// with VEXTRACTF128, VEXTRACTI128 instructions.
+ unsigned getExtractVEXTRACT128Immediate(SDNode *N);
- /// getInsertVINSERTF128Immediate - Return the appropriate
+ /// getInsertVINSERT128Immediate - Return the appropriate
/// immediate to insert at the specified INSERT_SUBVECTOR index
- /// with VINSERTF128 instructions.
- unsigned getInsertVINSERTF128Immediate(SDNode *N);
+ /// with VINSERTF128, VINSERT128 instructions.
+ unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+ /// getExtractVEXTRACT256Immediate - Return the appropriate
+ /// immediate to extract the specified EXTRACT_SUBVECTOR index
+ /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+ unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+ /// getInsertVINSERT256Immediate - Return the appropriate
+ /// immediate to insert at the specified INSERT_SUBVECTOR index
+ /// with VINSERTF64x4, VINSERTI64x4 instructions.
+ unsigned getInsertVINSERT256Immediate(SDNode *N);
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
@@ -610,7 +640,7 @@ namespace llvm {
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- EVT VT) const;
+ MVT VT) const;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
@@ -634,6 +664,8 @@ namespace llvm {
virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+ virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
/// isZExtFree - Return true if any actual instruction that defines a
/// value of type Ty1 implicit zero-extends the value to Ty2 in the result
/// register. This does not necessarily include registers defined in
@@ -646,11 +678,11 @@ namespace llvm {
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
virtual bool isZExtFree(SDValue Val, EVT VT2) const;
- /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
- /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
- /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
- /// is expanded to mul + add.
- virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
@@ -802,6 +834,7 @@ namespace llvm {
SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
@@ -821,7 +854,9 @@ namespace llvm {
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 0000000..8abae14
--- /dev/null
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,715 @@
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+ def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+// -- 32x8 form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+ "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, f128mem:$src2, i8imm:$src3),
+ "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+}
+
+// -- 64x4 fp form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in {
+def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+ "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+ "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+// -- 32x4 integer form --
+let neverHasSideEffects = 1 in {
+def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+ "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i128mem:$src2, i8imm:$src3),
+ "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+}
+
+let neverHasSideEffects = 1 in {
+// -- 64x4 form --
+def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+ "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst),
+ (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+ "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2),
+ (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2),
+ (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2),
+ (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
+ (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
+ (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2),
+ (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2),
+ (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2),
+ (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2),
+ (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2),
+ (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2),
+ (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2),
+ (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2),
+ (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2),
+ (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
+ (bc_v8i32 (loadv4i64 addr:$src2)),
+ (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+ (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+// vinsertps - insert f32 to XMM
+def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
+ "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ EVEX_4V;
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
+ "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set VR128X:$dst, (X86insrtps VR128X:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+// -- 32x4 form --
+def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512;
+def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs),
+ (ins f128mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
+ (ins f256mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+let neverHasSideEffects = 1 in {
+// -- 32x4 form --
+def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512;
+def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst),
+ (ins VR512:$src1, i8imm:$src2),
+ "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs),
+ (ins i256mem:$dst, VR512:$src1, i8imm:$src2),
+ "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+ (v4f32 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)),
+ (v4i32 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+ (v2f64 (VEXTRACTF32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+ (v2i64 (VEXTRACTI32x4rr VR512:$src1,
+ (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+
+def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+ (v8f32 (VEXTRACTF64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)),
+ (v8i32 (VEXTRACTI64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+ (v4f64 (VEXTRACTF64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+ (v4i64 (VEXTRACTI64x4rr VR512:$src1,
+ (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+// A 256-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+
+// zmm -> xmm
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+ (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+ (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+
+
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ sub_ymm)>;
+
+def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+ (ins VR128X:$src1, u32u8imm:$src2),
+ "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ EVEX;
+
+def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
+ "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+ addr:$dst)]>, EVEX;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr,
+ RegisterClass DestRC,
+ RegisterClass SrcRC, X86MemOperand x86memop> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX;
+}
+let ExeDomain = SSEPackedSingle in {
+ defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512,
+ VR128X, f32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+ defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
+ VR128X, f64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSDZrm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
+ RegisterClass SrcRC, RegisterClass KRC> {
+ def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, EVEX_V512;
+ def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst),
+ (ins KRC:$mask, SrcRC:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_V512, EVEX_KZ;
+}
+
+defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
+ VEX_W;
+
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+ (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+ (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+ (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+ (VPBROADCASTQrZrr GR64:$src)>;
+
+multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
+ RegisterClass KRC> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
+ def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+ VR128X:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
+ EVEX, EVEX_KZ;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
+ def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+ x86memop:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
+ (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+}
+
+defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
+ loadi32, VR512, v16i32, v4i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
+ loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
+ (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
+ (VBROADCASTSDZrr VR128X:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+ (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+ (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
+ (EXTRACT_SUBREG
+ (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ addr:$src)), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ RegisterClass DstRC, RegisterClass KRC,
+ ValueType OpVT, ValueType SrcVT> {
+def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
+ VK16, v16i32, v16i1>, EVEX_V512;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
+ VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+ string OpcodeStr, RegisterClass KRC,
+ ValueType vt, X86MemOperand x86memop> {
+ let neverHasSideEffects = 1 in {
+ def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ let mayLoad = 1 in
+ def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (vt (load addr:$src)))]>;
+ let mayStore = 1 in
+ def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ }
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+ string OpcodeStr,
+ RegisterClass KRC, RegisterClass GRC> {
+ let neverHasSideEffects = 1 in {
+ def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ }
+}
+
+let Predicates = [HasAVX512] in {
+ defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+ VEX, TB;
+ defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+ VEX, TB;
+}
+
+let Predicates = [HasAVX512] in {
+ // GR16 from/to 16-bit mask
+ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
+ def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+ (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+
+ // Store kreg in memory
+ def : Pat<(store (v16i1 VK16:$src), addr:$dst),
+ (KMOVWmk addr:$dst, VK16:$src)>;
+
+ def : Pat<(store (v8i1 VK8:$src), addr:$dst),
+ (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>;
+}
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+ // GR from/to 8-bit mask without native support
+ def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VK8)>;
+ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (EXTRACT_SUBREG
+ (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+ sub_8bit)>;
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode> {
+ defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, TB;
+}
+
+defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+
+def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
+
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+def : Pat<(not VK8:$src),
+ (COPY_TO_REGCLASS
+ (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+// Mask binary operation
+// - KADD, KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC, SDPatternOperator OpNode> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode> {
+ defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX_4V, VEX_L, TB;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+
+let isCommutable = 1 in {
+ defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>;
+ defm KAND : avx512_mask_binop_w<0x41, "kand", and>;
+ let isCommutable = 0 in
+ defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
+ defm KOR : avx512_mask_binop_w<0x45, "kor", or>;
+ defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>;
+ defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>;
+}
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+ VK16:$src1, VK16:$src2),
+ (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>;
+}
+
+defm : avx512_mask_binop_int<"kadd", "KADD">;
+defm : avx512_mask_binop_int<"kand", "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor", "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor", "KXOR">;
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(OpNode VK8:$src1, VK8:$src2),
+ (COPY_TO_REGCLASS
+ (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+ (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+}
+
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<andn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
+ RegisterClass KRC1, RegisterClass KRC2> {
+ let Predicates = [HasAVX512] in
+ def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+}
+
+multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
+ defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>,
+ VEX_4V, VEX_L, OpSize, TB;
+}
+
+defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
+
+multiclass avx512_mask_unpck_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+ VK8:$src1, VK8:$src2),
+ (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>;
+}
+
+defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode> {
+ let Predicates = [HasAVX512], Defs = [EFLAGS] in
+ def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, TB;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ SDNode OpNode> {
+ let Predicates = [HasAVX512] in
+ def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
+ !strconcat(OpcodeStr,
+ "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+ SDNode OpNode> {
+ defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+ VEX, OpSize, TA, VEX_W;
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>;
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+ let Predicates = [HasAVX512] in
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+ def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+ defm B : avx512_mask_setop<VK8, v8i1, Val>;
+ defm W : avx512_mask_setop<VK16, v16i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+ def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+}
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
+
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+ (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index fa2b2d8..9ce02ba 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1041,13 +1041,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // Defs = [EFLAGS]
def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
+ "{$src, %al|al, $src}">;
def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
+ "{$src, %ax|ax, $src}">;
def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
+ "{$src, %eax|eax, $src}">;
def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -1112,13 +1112,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // Uses = [EFLAGS], Defs = [EFLAGS]
def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
+ "{$src, %al|al, $src}">;
def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
+ "{$src, %ax|ax, $src}">;
def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
+ "{$src, %eax|eax, $src}">;
def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1179,13 +1179,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // Defs = [EFLAGS]
def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
+ "{$src, %al|al, $src}">;
def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
+ "{$src, %ax|ax, $src}">;
def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
+ "{$src, %eax|eax, $src}">;
def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
+ "{$src, %rax|rax, $src}">;
}
@@ -1253,13 +1253,13 @@ let isCompare = 1 in {
} // Defs = [EFLAGS]
def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
- "{$src, %al|AL, $src}">;
+ "{$src, %al|al, $src}">;
def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
- "{$src, %ax|AX, $src}">;
+ "{$src, %ax|ax, $src}">;
def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
+ "{$src, %eax|eax, $src}">;
def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
+ "{$src, %rax|rax, $src}">;
} // isCompare
//===----------------------------------------------------------------------===//
@@ -1302,12 +1302,12 @@ let neverHasSideEffects = 1 in {
let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>;
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>;
+ [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
}
}
@@ -1336,7 +1336,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"adcx{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8, OpSize;
-
+
def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adcx{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
@@ -1361,7 +1361,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"adox{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8XS;
-
+
def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adox{q}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 8a7ee7d..8969946 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
// The MSVC runtime contains an _ftol2 routine for converting floating-point
// to integer values. It has a strange calling convention: the input is
-// popped from the x87 stack, and the return value is given in EDX:EAX. No
-// other registers (aside from flags) are touched.
+// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
+// used as a temporary register. No other registers (aside from flags) are
+// touched.
// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
// variant is unnecessary.
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
"# win32 fptoui",
[(X86WinFTOL RFP32:$src)]>,
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 2224a08..7c37888 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm>
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">;
def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">;
def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
@@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovb\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+ "fcmove\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
+ "fcmovu\t{$op, %st(0)|st(0), $op}">, DA;
def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovne\t{$op, %st(0)|st(0), $op}">, DB;
def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB;
} // Predicates = [HasCMov]
// Floating point loads & stores.
@@ -578,7 +578,7 @@ def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
let SchedRW = [WriteALU] in {
let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
- (outs), (ins), "fnstsw %ax",
+ (outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 1432414..64018b3 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -96,6 +96,20 @@ def SSEPackedSingle : Domain<1>;
def SSEPackedDouble : Domain<2>;
def SSEPackedInt : Domain<3>;
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+class CD8VForm<bits<3> val> {
+ bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
// Prefix byte classes which are used to indicate to the ad-hoc machine code
// emitter that various prefix bytes are required.
class OpSize { bit hasOpSizePrefix = 1; }
@@ -132,6 +146,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { bit hasEVEXPrefix = 1; }
+class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_CD8<int esize, CD8VForm form> {
+ bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00,
+ !if(!eq(esize, 16), 0b01,
+ !if(!eq(esize, 32), 0b10,
+ !if(!eq(esize, 64), 0b11, ?))));
+ bits<3> EVEX_CD8V = form.Value;
+}
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class MemOp4 { bit hasMemOp4Prefix = 1; }
class XOP { bit hasXOP_Prefix = 1; }
@@ -177,6 +204,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// to be encoded in a immediate field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
+ bit hasEVEXPrefix = 0; // Does this inst require EVEX form?
+ bit hasEVEX_K = 0; // Does this inst require masking?
+ bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
+ bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
+ bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size.
+ bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width.
bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix?
@@ -200,9 +234,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{37} = hasVEX_i8ImmReg;
let TSFlags{38} = hasVEX_L;
let TSFlags{39} = ignoresVEX_L;
- let TSFlags{40} = has3DNow0F0FOpcode;
- let TSFlags{41} = hasMemOp4Prefix;
- let TSFlags{42} = hasXOP_Prefix;
+ let TSFlags{40} = hasEVEXPrefix;
+ let TSFlags{41} = hasEVEX_K;
+ let TSFlags{42} = hasEVEX_Z;
+ let TSFlags{43} = hasEVEX_L2;
+ let TSFlags{44} = hasEVEX_B;
+ let TSFlags{46-45} = EVEX_CD8E;
+ let TSFlags{49-47} = EVEX_CD8V;
+ let TSFlags{50} = has3DNow0F0FOpcode;
+ let TSFlags{51} = hasMemOp4Prefix;
+ let TSFlags{52} = hasXOP_Prefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -553,6 +594,74 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
Requires<[HasAVX2]>;
+
+// AVX-512 Instruction Templates:
+// Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+// AVX5128I - AVX-512 instructions with T8 and OpSize prefix.
+// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed.
+// AVX512PSI - AVX-512 instructions with TB, single packed.
+// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
+// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain.
+// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+ Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, XS,
+ Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+ Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ Requires<[HasAVX512]>;
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB,
+ OpSize, Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8,
+ OpSize, EVEX_4V, Requires<[HasAVX512]>;
+
// AES Instruction Templates:
//
// AES8I
@@ -628,6 +737,13 @@ class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
let CodeSize = 3;
}
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 2a72fb6..0b51521 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -47,6 +47,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
@@ -136,6 +138,8 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -153,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
-def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
@@ -192,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
@@ -405,28 +412,54 @@ def BYTE_imm : SDNodeXForm<imm, [{
return getI32Imm(N->getZExtValue() >> 3);
}]>;
-// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
-// to VEXTRACTF128 imm.
-def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
- return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT128Immediate(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT128Immediate(N));
}]>;
-// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
-// VINSERTF128 imm.
-def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
- return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+ return getI8Imm(X86::getExtractVEXTRACT256Immediate(N));
}]>;
-def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+ return getI8Imm(X86::getInsertVINSERT256Immediate(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+ (extract_subvector node:$bigvec,
+ node:$index), [{
+ return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+ node:$index),
+ (insert_subvector node:$bigvec, node:$smallvec,
+ node:$index), [{
+ return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
node:$index), [{
- return X86::isVEXTRACTF128Index(N);
-}], EXTRACT_get_vextractf128_imm>;
+ return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
-def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
(insert_subvector node:$bigvec, node:$smallvec,
node:$index), [{
- return X86::isVINSERTF128Index(N);
-}], INSERT_get_vinsertf128_imm>;
+ return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index df7b721..0443a93 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3733,19 +3733,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return false;
}
-MachineInstr*
-X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx, uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const {
- X86AddressMode AM;
- AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = FrameIx;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
- addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
- return &*MIB;
-}
-
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
const SmallVectorImpl<MachineOperand> &MOs,
MachineInstr *MI,
@@ -4660,6 +4647,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
+ MachineInstr *Second) const {
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+ return false;
+
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ switch(Second->getOpcode()) {
+ default:
+ return false;
+ case X86::JE_4:
+ case X86::JNE_4:
+ case X86::JL_4:
+ case X86::JLE_4:
+ case X86::JG_4:
+ case X86::JGE_4:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_4:
+ case X86::JBE_4:
+ case X86::JA_4:
+ case X86::JAE_4:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_4:
+ case X86::JNS_4:
+ case X86::JP_4:
+ case X86::JNP_4:
+ case X86::JO_4:
+ case X86::JNO_4:
+ FuseKind = FuseTest;
+ break;
+ }
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64_16r:
+ case X86::INC64_32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64_16r:
+ case X86::DEC64_32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ }
+}
bool X86InstrInfo::
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 332874f..a0d1ba7 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -275,12 +275,6 @@ public:
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
- virtual
- MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx, uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const;
-
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
/// specified operand(s). If this is possible, the target should perform the
@@ -345,6 +339,9 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const;
+ virtual bool shouldScheduleAdjacent(MachineInstr* First,
+ MachineInstr *Second) const LLVM_OVERRIDE;
+
virtual void getNoopForMachoTarget(MCInst &NopInst) const;
virtual
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 817bd6c..0960a2a 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -317,6 +317,16 @@ def X86MemVY64Operand : AsmOperandClass {
let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
}
+def X86MemVZ64Operand : AsmOperandClass {
+ let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+ let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+ let Name = "Mem512"; let PredicateMethod = "isMem512";
+}
+
def X86AbsMemAsmOperand : AsmOperandClass {
let Name = "AbsMem";
let SuperClasses = [X86MemAsmOperand];
@@ -345,6 +355,8 @@ def i128mem : X86MemOperand<"printi128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def i256mem : X86MemOperand<"printi256mem"> {
let ParserMatchClass = X86Mem256AsmOperand; }
+def i512mem : X86MemOperand<"printi512mem"> {
+ let ParserMatchClass = X86Mem512AsmOperand; }
def f32mem : X86MemOperand<"printf32mem"> {
let ParserMatchClass = X86Mem32AsmOperand; }
def f64mem : X86MemOperand<"printf64mem"> {
@@ -355,6 +367,12 @@ def f128mem : X86MemOperand<"printf128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
def f256mem : X86MemOperand<"printf256mem">{
let ParserMatchClass = X86Mem256AsmOperand; }
+def f512mem : X86MemOperand<"printf512mem">{
+ let ParserMatchClass = X86Mem512AsmOperand; }
+def v512mem : Operand<iPTR> {
+ let PrintMethod = "printf512mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86Mem512AsmOperand; }
// Gather mem operands
def vx32mem : X86MemOperand<"printi32mem">{
@@ -369,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{
def vy64mem : X86MemOperand<"printi64mem">{
let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
let ParserMatchClass = X86MemVY64Operand; }
+def vy64xmem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm);
+ let ParserMatchClass = X86MemVY64Operand; }
+def vz32mem : X86MemOperand<"printi32mem">{
+ let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86MemVZ32Operand; }
+def vz64mem : X86MemOperand<"printi64mem">{
+ let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+ let ParserMatchClass = X86MemVZ64Operand; }
}
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -590,11 +617,19 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
+def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI : Predicate<"Subtarget->hasCDI()">;
+def HasPFI : Predicate<"Subtarget->hasPFI()">;
+def HasEMI : Predicate<"Subtarget->hasERI()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
@@ -803,11 +838,11 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>;
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
IIC_POP_REG>, OpSize;
-def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
IIC_POP_MEM>, OpSize;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>;
-def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
IIC_POP_MEM>;
def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
@@ -851,7 +886,7 @@ def POP64r : I<0x58, AddRegFrm,
(outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>;
-def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
IIC_POP_MEM>;
} // mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
@@ -1041,40 +1076,52 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
/// 32-bit offset from the PC. These are only valid in x86-32 mode.
let SchedRW = [WriteALU] in {
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
- "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize,
Requires<[In32BitMode]>;
def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
- "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
}
-// FIXME: These definitions are utterly broken
-// Just leave them commented out for now because they're useless outside
-// of the large code model, and most compilers won't generate the instructions
-// in question.
-/*
-def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
- "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-*/
-
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src),
+ "movabs{b}\t{$src, %al|al, $src}", []>,
+ Requires<[In64BitMode]>;
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
+ Requires<[In64BitMode]>;
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>,
+ Requires<[In64BitMode]>;
+def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "movabs{q}\t{$src, %rax|rax, $src}", []>,
+ Requires<[In64BitMode]>;
+
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{b}\t{%al, $dst|$dst, al}", []>,
+ Requires<[In64BitMode]>;
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
+ Requires<[In64BitMode]>;
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
+ Requires<[In64BitMode]>;
+def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
+ Requires<[In64BitMode]>;
let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
@@ -1407,17 +1454,17 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
// Swap between EAX and other registers.
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
+ "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
Requires<[In32BitMode]>;
// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
- "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+ "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
Requires<[In64BitMode]>;
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
+ "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
} // SchedRW
let SchedRW = [WriteALU] in {
@@ -1814,6 +1861,7 @@ include "X86InstrXOP.td"
// SSE, MMX and 3DNow! vector support.
include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
include "X86InstrMMX.td"
include "X86Instr3DNow.td"
@@ -1921,29 +1969,31 @@ def : MnemonicAlias<"fucomip", "fucompi", "att">;
def : MnemonicAlias<"fwait", "wait", "att">;
-class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond>
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+ string VariantName>
: MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
- !strconcat(Prefix, NewCond, Suffix)>;
+ !strconcat(Prefix, NewCond, Suffix), VariantName>;
/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
/// example "setz" -> "sete".
-multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> {
- def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb
- def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete
- def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe
- def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae
- def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae
- def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle
- def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge
- def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne
- def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp
- def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp
-
- def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb
- def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta
- def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl
- def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+ string V = ""> {
+ def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb
+ def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete
+ def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe
+ def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae
+ def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae
+ def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle
+ def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge
+ def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne
+ def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp
+ def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp
+
+ def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb
+ def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta
+ def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl
+ def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg
}
// Aliases for set<CC>
@@ -1951,9 +2001,11 @@ defm : IntegerCondCodeMnemonicAlias<"set", "">;
// Aliases for j<CC>
defm : IntegerCondCodeMnemonicAlias<"j", "">;
// Aliases for cmov<CC>{w,l,q}
-defm : IntegerCondCodeMnemonicAlias<"cmov", "w">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "l">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "q">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
//===----------------------------------------------------------------------===//
@@ -1965,75 +2017,83 @@ def : InstAlias<"aad", (AAD8i8 10)>;
def : InstAlias<"aam", (AAM8i8 10)>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
-def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
// clr aliases.
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>;
-def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
-def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
-def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// div and idiv aliases for explicit A register.
-def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
-def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
// Various unary fpstack operations default to operating on on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
-def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>;
-def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>;
-def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>;
-def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>;
-def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>;
-def : InstAlias<"fxch", (XCH_F ST1)>;
-def : InstAlias<"fcom", (COM_FST0r ST1)>;
-def : InstAlias<"fcomp", (COMP_FST0r ST1)>;
-def : InstAlias<"fcomi", (COM_FIr ST1)>;
-def : InstAlias<"fcompi", (COM_FIPr ST1)>;
-def : InstAlias<"fucom", (UCOM_Fr ST1)>;
-def : InstAlias<"fucomp", (UCOM_FPr ST1)>;
-def : InstAlias<"fucomi", (UCOM_FIr ST1)>;
-def : InstAlias<"fucompi", (UCOM_FIPr ST1)>;
+def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
+def : InstAlias<"fxch", (XCH_F ST1), 0>;
+def : InstAlias<"fcom", (COM_FST0r ST1), 0>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>;
+def : InstAlias<"fcomi", (COM_FIr ST1), 0>;
+def : InstAlias<"fcompi", (COM_FIPr ST1), 0>;
+def : InstAlias<"fucom", (UCOM_Fr ST1), 0>;
+def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>;
+def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>;
+def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>;
// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate
// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
// gas.
multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
- def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
(Inst RST:$op), EmitAlias>;
- def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
(Inst ST0), EmitAlias>;
}
defm : FpUnaryAlias<"fadd", ADD_FST0r>;
defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>;
defm : FpUnaryAlias<"fsub", SUB_FST0r>;
-defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>;
defm : FpUnaryAlias<"fsubr", SUBR_FST0r>;
-defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
defm : FpUnaryAlias<"fmul", MUL_FST0r>;
defm : FpUnaryAlias<"fmulp", MUL_FPrST0>;
defm : FpUnaryAlias<"fdiv", DIV_FST0r>;
-defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>;
defm : FpUnaryAlias<"fdivr", DIVR_FST0r>;
-defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
defm : FpUnaryAlias<"fcomi", COM_FIr, 0>;
defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>;
defm : FpUnaryAlias<"fcompi", COM_FIPr>;
@@ -2043,16 +2103,16 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>;
// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
// commute. We also allow fdiv[r]p/fsubrp even though they don't commute,
// solely because gas supports it.
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
-def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
-def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
-def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
-def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
-def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
// We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
-def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
@@ -2071,12 +2131,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
// inb %dx -> inb %al, %dx
-def : InstAlias<"inb %dx", (IN8rr)>;
-def : InstAlias<"inw %dx", (IN16rr)>;
-def : InstAlias<"inl %dx", (IN32rr)>;
-def : InstAlias<"inb $port", (IN8ri i8imm:$port)>;
-def : InstAlias<"inw $port", (IN16ri i8imm:$port)>;
-def : InstAlias<"inl $port", (IN32ri i8imm:$port)>;
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
@@ -2104,7 +2164,7 @@ def : InstAlias<"movq $src, $dst",
// movsd with no operands (as opposed to the SSE scalar move of a double) is an
// alias for movsl. (as in rep; movsd)
-def : InstAlias<"movsd", (MOVSD)>;
+def : InstAlias<"movsd", (MOVSD), 0>;
// movsx aliases
def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
@@ -2125,12 +2185,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
// Note: No GR32->GR64 movzx form.
// outb %dx -> outb %al, %dx
-def : InstAlias<"outb %dx", (OUT8rr)>;
-def : InstAlias<"outw %dx", (OUT16rr)>;
-def : InstAlias<"outl %dx", (OUT32rr)>;
-def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>;
-def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>;
-def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
@@ -2138,19 +2198,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
// shld/shrd op,op -> shld op, op, CL
-def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
-def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
-
-def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
-def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
/* FIXME: This is disabled because the asm matcher is currently incapable of
* matching a fixed immediate like $1.
@@ -2181,19 +2241,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
FIXME */
// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>;
// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>;
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
-def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
-def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 07314a0..cb12956 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -189,13 +189,14 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, Domain d> {
- def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
- asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- NoItinerary, d>;
- def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
- (ins DstRC:$src1, x86memop:$src2), asm,
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- NoItinerary, d>;
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins DstRC:$src1, SrcRC:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ NoItinerary, d>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2), asm,
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ NoItinerary, d>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 79b1ca3..a86006a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2843,8 +2843,8 @@ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
SSE_BIT_ITINS_P>;
-let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
- defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+let isCommutable = 0 in
+ defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
SSE_BIT_ITINS_P>;
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
@@ -5477,12 +5477,12 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
TB, Requires<[HasSSE3]>;
} // SchedRW
-def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
-def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
-def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
Requires<[In32BitMode]>;
-def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
@@ -6139,11 +6139,11 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
}
let ExeDomain = SSEPackedSingle in {
- let Predicates = [HasAVX] in {
+ let Predicates = [UseAVX] in {
defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
(ins VR128:$src1, i32i8imm:$src2),
- "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, OpSize, VEX;
}
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
@@ -7016,17 +7016,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
int_x86_sse41_pblendvb>;
// Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
let Predicates = [UseSSE41] in {
@@ -7266,62 +7266,62 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
let Constraints = "$src1 = $dst" in {
def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i8mem:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_8 GR32:$src1,
(load addr:$src2)))]>;
def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR8:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i16mem:$src2),
- "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ "crc32{w}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_16 GR32:$src1,
(load addr:$src2)))]>,
OpSize;
def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR16:$src2),
- "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ "crc32{w}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
OpSize;
def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
- "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ "crc32{l}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_32 GR32:$src1,
(load addr:$src2)))]>;
def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ "crc32{l}\t{$src2, $src1|$src1, $src2}",
[(set GR32:$dst,
(int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i8mem:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_8 GR64:$src1,
(load addr:$src2)))]>,
REX_W;
def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR8:$src2),
- "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ "crc32{b}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
REX_W;
def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
- "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ "crc32{q}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_64 GR64:$src1,
(load addr:$src2)))]>,
REX_W;
def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ "crc32{q}\t{$src2, $src1|$src1, $src2}",
[(set GR64:$dst,
(int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
REX_W;
@@ -7586,62 +7586,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX] in {
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
(bc_v4i32 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
(bc_v16i8 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
(bc_v8i16 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -7661,59 +7661,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
// AVX1 patterns
let Predicates = [HasAVX] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2f64 (VEXTRACTF128rr
(v4f64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTF128rr
(v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTF128rr
(v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTF128rr
(v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTF128rr
(v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
@@ -8191,42 +8191,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX2] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
(bc_v4i32 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
(bc_v16i8 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
(bc_v8i16 (memopv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -8245,39 +8245,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
VEX, VEX_L;
let Predicates = [HasAVX2] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTI128rr
(v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTI128rr
(v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTI128rr
(v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTI128rr
(v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
(iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 757dcd0..0191c01 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+ "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX] in
def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+ "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+ "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+ "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 89c1a689..1937770 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -18,16 +18,16 @@ let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
- "shl{b}\t{%cl, $dst|$dst, CL}",
+ "shl{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t{%cl, $dst|$dst, CL}",
+ "shl{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t{%cl, $dst|$dst, CL}",
+ "shl{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
- "shl{q}\t{%cl, $dst|$dst, CL}",
+ "shl{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
} // Uses = [CL]
@@ -70,17 +70,17 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
// using CL?
let Uses = [CL] in {
def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
- "shl{b}\t{%cl, $dst|$dst, CL}",
+ "shl{b}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
- "shl{w}\t{%cl, $dst|$dst, CL}",
+ "shl{w}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
OpSize;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
- "shl{l}\t{%cl, $dst|$dst, CL}",
+ "shl{l}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
- "shl{q}\t{%cl, $dst|$dst, CL}",
+ "shl{q}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -124,16 +124,16 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
- "shr{b}\t{%cl, $dst|$dst, CL}",
+ "shr{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
- "shr{w}\t{%cl, $dst|$dst, CL}",
+ "shr{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
- "shr{l}\t{%cl, $dst|$dst, CL}",
+ "shr{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
- "shr{q}\t{%cl, $dst|$dst, CL}",
+ "shr{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
}
@@ -171,17 +171,17 @@ def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
- "shr{b}\t{%cl, $dst|$dst, CL}",
+ "shr{b}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
- "shr{w}\t{%cl, $dst|$dst, CL}",
+ "shr{w}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
OpSize;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
- "shr{l}\t{%cl, $dst|$dst, CL}",
+ "shr{l}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
- "shr{q}\t{%cl, $dst|$dst, CL}",
+ "shr{q}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -224,19 +224,19 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
- "sar{b}\t{%cl, $dst|$dst, CL}",
+ "sar{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (sra GR8:$src1, CL))],
IIC_SR>;
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
- "sar{w}\t{%cl, $dst|$dst, CL}",
+ "sar{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (sra GR16:$src1, CL))],
IIC_SR>, OpSize;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
- "sar{l}\t{%cl, $dst|$dst, CL}",
+ "sar{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (sra GR32:$src1, CL))],
IIC_SR>;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
- "sar{q}\t{%cl, $dst|$dst, CL}",
+ "sar{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (sra GR64:$src1, CL))],
IIC_SR>;
}
@@ -283,19 +283,19 @@ def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
- "sar{b}\t{%cl, $dst|$dst, CL}",
+ "sar{b}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
- "sar{w}\t{%cl, $dst|$dst, CL}",
+ "sar{w}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
- "sar{l}\t{%cl, $dst|$dst, CL}",
+ "sar{l}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
- "sar{q}\t{%cl, $dst|$dst, CL}",
+ "sar{q}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -349,7 +349,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"rcl{w}\t$dst", [], IIC_SR>, OpSize;
@@ -357,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"rcl{l}\t$dst", [], IIC_SR>;
@@ -365,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
@@ -374,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
@@ -383,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"rcr{w}\t$dst", [], IIC_SR>, OpSize;
@@ -391,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
let Uses = [CL] in
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"rcr{l}\t$dst", [], IIC_SR>;
@@ -399,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
@@ -407,7 +407,7 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
} // Constraints = "$src = $dst"
@@ -448,22 +448,22 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
let Uses = [CL] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
- "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
- "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
}
} // SchedRW
} // hasSideEffects = 0
@@ -472,16 +472,16 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
// FIXME: provide shorter instructions when imm8 == 1
let Uses = [CL] in {
def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
- "rol{b}\t{%cl, $dst|$dst, CL}",
+ "rol{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
- "rol{w}\t{%cl, $dst|$dst, CL}",
+ "rol{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
- "rol{l}\t{%cl, $dst|$dst, CL}",
+ "rol{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
- "rol{q}\t{%cl, $dst|$dst, CL}",
+ "rol{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
}
@@ -525,19 +525,19 @@ def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
- "rol{b}\t{%cl, $dst|$dst, CL}",
+ "rol{b}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
- "rol{w}\t{%cl, $dst|$dst, CL}",
+ "rol{w}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
- "rol{l}\t{%cl, $dst|$dst, CL}",
+ "rol{l}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
- "rol{q}\t{%cl, $dst|$dst, %cl}",
+ "rol{q}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -582,16 +582,16 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
- "ror{b}\t{%cl, $dst|$dst, CL}",
+ "ror{b}\t{%cl, $dst|$dst, cl}",
[(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
- "ror{w}\t{%cl, $dst|$dst, CL}",
+ "ror{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
- "ror{l}\t{%cl, $dst|$dst, CL}",
+ "ror{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
- "ror{q}\t{%cl, $dst|$dst, CL}",
+ "ror{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
}
@@ -635,19 +635,19 @@ def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
- "ror{b}\t{%cl, $dst|$dst, CL}",
+ "ror{b}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
- "ror{w}\t{%cl, $dst|$dst, CL}",
+ "ror{w}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
IIC_SR>, OpSize;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
- "ror{l}\t{%cl, $dst|$dst, CL}",
+ "ror{l}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
IIC_SR>;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
- "ror{q}\t{%cl, $dst|$dst, CL}",
+ "ror{q}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
@@ -699,35 +699,35 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
let Uses = [CL] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
- "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
TB, OpSize;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
- "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
TB, OpSize;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
IIC_SHD32_REG_CL>, TB;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
- "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
IIC_SHD32_REG_CL>, TB;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
IIC_SHD64_REG_CL>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
IIC_SHD64_REG_CL>,
TB;
@@ -782,29 +782,29 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
let SchedRW = [WriteShiftLd, WriteRMW] in {
let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)], IIC_SHD32_MEM_CL>, TB;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)], IIC_SHD64_MEM_CL>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)], IIC_SHD64_MEM_CL>, TB;
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index bab3cdd..2196dc3 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -77,43 +77,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
let SchedRW = [WriteSystem] in {
let Defs = [AL], Uses = [DX] in
def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
+ "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize;
+ "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
+ "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
- "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
+ "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
+ "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
+ "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
+ "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
+ "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
+ "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
- "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
+ "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
+ "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
+ "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>;
def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
@@ -248,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+ "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
OpSize;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
OpSize, Requires<[In32BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+ "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
Requires<[In32BitMode]>;
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+ "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
OpSize, Requires<[In32BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+ "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
Requires<[In32BitMode]>;
def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+ "pop{w}\t{%es|es}", [], IIC_POP_SR>,
OpSize, Requires<[In32BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+ "pop{l}\t{%es|es}", [], IIC_POP_SR>,
Requires<[In32BitMode]>;
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
+ "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB;
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
+ "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB;
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 363a190..59a6f1e 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -37,3 +37,10 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins),
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
[(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index a453245..c7c00b5 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -254,6 +254,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
Inst.addOperand(Saved);
}
+/// \brief If a movsx instruction has a shorter encoding for the used register
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+ unsigned NewOpcode = 0;
+ unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
+ if (Op0 == X86::AX && Op1 == X86::AL)
+ NewOpcode = X86::CBW;
+ break;
+ case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
+ if (Op0 == X86::EAX && Op1 == X86::AX)
+ NewOpcode = X86::CWDE;
+ break;
+ case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+ if (Op0 == X86::RAX && Op1 == X86::EAX)
+ NewOpcode = X86::CDQE;
+ break;
+ }
+
+ if (NewOpcode != 0) {
+ Inst = MCInst();
+ Inst.setOpcode(NewOpcode);
+ }
+}
+
/// \brief Simplify things like MOV32rm to MOV32o32a.
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
@@ -557,6 +585,13 @@ ReSimplify:
case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+ // Try to shrink some forms of movsx.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr32:
+ SimplifyMOVSX(OutMI);
+ break;
+
case X86::MORESTACK_RET:
OutMI.setOpcode(X86::RET);
break;
@@ -654,13 +689,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(Mang, *MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
- if (isVerbose() && OutStreamer.hasRawTextSupport()) {
- std::string TmpStr;
- raw_string_ostream OS(TmpStr);
- PrintDebugValueComment(MI, OS);
- OutStreamer.EmitRawText(StringRef(OS.str()));
- }
- return;
+ llvm_unreachable("Should be handled target independently");
// Emit nothing here but a comment if we can.
case X86::Int_MemBarrier:
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index eacae2c..0923310 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -241,6 +241,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::Intel_OCL_BI: {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ if (HasAVX512 && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+ if (HasAVX512 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX512_SaveList;
if (HasAVX && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
if (HasAVX && Is64Bit)
@@ -275,8 +280,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX512)
+ return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+ if (Is64Bit && HasAVX512)
+ return CSR_64_Intel_OCL_BI_AVX512_RegMask;
if (IsWin64 && HasAVX)
return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
if (Is64Bit && HasAVX)
@@ -344,14 +354,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(X86::GS);
// Mark the floating point stack registers as reserved.
- Reserved.set(X86::ST0);
- Reserved.set(X86::ST1);
- Reserved.set(X86::ST2);
- Reserved.set(X86::ST3);
- Reserved.set(X86::ST4);
- Reserved.set(X86::ST5);
- Reserved.set(X86::ST6);
- Reserved.set(X86::ST7);
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
// Reserve the registers that only exist in 64-bit mode.
if (!Is64Bit) {
@@ -364,19 +368,17 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned n = 0; n != 8; ++n) {
// R8, R9, ...
- static const uint16_t GPR64[] = {
- X86::R8, X86::R9, X86::R10, X86::R11,
- X86::R12, X86::R13, X86::R14, X86::R15
- };
- for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
// XMM8, XMM9, ...
- static const uint16_t XMMReg[] = {
- X86::XMM8, X86::XMM9, X86::XMM10, X86::XMM11,
- X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15
- };
- for (MCRegAliasIterator AI(XMMReg[n], this, true); AI.isValid(); ++AI)
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+ if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ for (unsigned n = 16; n != 32; ++n) {
+ for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
}
}
@@ -409,10 +411,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+ return false;
+
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- if (!MF.getTarget().Options.RealignStack)
- return false;
// Stack realignment requires a frame pointer. If we already started
// register allocation with frame pointer elimination, it is too late now.
@@ -690,4 +693,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
}
}
}
+
+unsigned get512BitSuperRegister(unsigned Reg) {
+ if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+ return X86::ZMM0 + (Reg - X86::XMM0);
+ if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+ return X86::ZMM0 + (Reg - X86::YMM0);
+ if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+ return Reg;
+ llvm_unreachable("Unexpected SIMD register");
+}
+
}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 6a1b328..fb17682 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -137,6 +137,9 @@ public:
// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+//get512BitRegister - X86 utility - returns 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index fbbb257..b802728 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -26,6 +26,7 @@ let Namespace = "X86" in {
def sub_16bit : SubRegIndex<16>;
def sub_32bit : SubRegIndex<32>;
def sub_xmm : SubRegIndex<128>;
+ def sub_ymm : SubRegIndex<256>;
}
//===----------------------------------------------------------------------===//
@@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
} // CostPerUse
-// YMM Registers, used by AVX instructions
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
let SubRegIndices = [sub_xmm] in {
-def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
-def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>;
-def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>;
-def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>;
-def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>;
-def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>;
-def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>;
-def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>;
-def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>;
-def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>;
-def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
-def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
-def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
-def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
-def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
-def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+ foreach Index = 0-31 in {
+ def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+ foreach Index = 0-31 in {
+ def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+ DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+ }
}
+ // Mask Registers, used by AVX-512 instructions.
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+ def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+ def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+ def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+ def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+ def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+ def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
let Aliases = A;
}
@@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512,
+ (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ 256, (sequence "YMM%u", 0, 31)>;
+
+def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>;
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>;
+
+def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>;
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
+
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 84c9203..62ba2bc 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -16,9 +16,8 @@ def HaswellModel : SchedMachineModel {
// All x86 instructions are modeled as a single micro-op, and HW can decode 4
// instructions per cycle.
let IssueWidth = 4;
- let MinLatency = 0; // 0 = Out-of-order execution.
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
let LoadLatency = 4;
- let ILPWindow = 30;
let MispredictPenalty = 16;
}
@@ -50,6 +49,12 @@ def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+ HWPort5, HWPort6, HWPort7]> {
+ let BufferSize=60;
+}
+
// Integer division issued on port 0.
def HWDivider : ProcResource<1>;
@@ -86,6 +91,7 @@ def : WriteRes<WriteZero, []>;
defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
defm : HWWriteResPair<WriteShift, HWPort056, 1>;
defm : HWWriteResPair<WriteJump, HWPort5, 1>;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index b36b3ad..52ead94 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -17,9 +17,8 @@ def SandyBridgeModel : SchedMachineModel {
// instructions per cycle.
// FIXME: Identify instructions that aren't a single fused micro-op.
let IssueWidth = 4;
- let MinLatency = 0; // 0 = Out-of-order execution.
+ let MicroOpBufferSize = 168; // Based on the reorder buffer.
let LoadLatency = 4;
- let ILPWindow = 20;
let MispredictPenalty = 16;
}
@@ -46,6 +45,11 @@ def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+ let BufferSize=54;
+}
+
// Integer division issued on port 0.
def SBDivider : ProcResource<1>;
@@ -82,6 +86,7 @@ def : WriteRes<WriteZero, []>;
defm : SBWriteResPair<WriteALU, SBPort015, 1>;
defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+def : WriteRes<WriteIMulH, []> { let Latency = 3; }
defm : SBWriteResPair<WriteShift, SBPort05, 1>;
defm : SBWriteResPair<WriteJump, SBPort5, 1>;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 9f2c781..ceb2e05 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -42,6 +42,7 @@ multiclass X86SchedWritePair {
// Arithmetic.
defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
defm WriteIDiv : X86SchedWritePair; // Integer division.
def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
@@ -550,8 +551,9 @@ def IIC_NOP : InstrItinClass;
// Resources beyond the decoder operate on micro-ops and are bufferred
// so adjacent micro-ops don't directly compete.
//
-// MinLatency=0 indicates that RAW dependencies can be decoded in the
-// same cycle.
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
//
// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
// indicates high latency opcodes. Alternatively, InstrItinData
@@ -559,17 +561,12 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// ILPWindow=10 is an arbitrary threshold that approximates cycles of
-// latency hidden by instruction buffers. The actual value is not very
-// important but should be zero for inorder and nonzero for OOO processors.
-//
// The GenericModel contains no instruciton itineraries.
def GenericModel : SchedMachineModel {
let IssueWidth = 4;
- let MinLatency = 0;
+ let MicroOpBufferSize = 32;
let LoadLatency = 4;
let HighLatency = 10;
- let ILPWindow = 10;
}
include "X86ScheduleAtom.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index cb0960a..14a1471 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -525,11 +525,9 @@ def AtomItineraries : ProcessorItineraries<
// Atom machine model.
def AtomModel : SchedMachineModel {
let IssueWidth = 2; // Allows 2 instructions per scheduling group.
- let MinLatency = 1; // InstrStage cycles overrides MinLatency.
- // OperandCycles may be used for expected latency.
+ let MicroOpBufferSize = 0; // In-order execution, always hide latency.
let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
let HighLatency = 30;// Expected, may be overriden by OperandCycles.
- let ILPWindow = 0; // Always try to hide expected latency.
let Itineraries = AtomItineraries;
}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 74da2a9..fae90f2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -477,6 +477,9 @@ void X86Subtarget::initializeEnvironment() {
HasBMI2 = false;
HasRTM = false;
HasHLE = false;
+ HasERI = false;
+ HasCDI = false;
+ HasPFI=false;
HasADX = false;
HasPRFCHW = false;
HasRDSEED = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 66832b9..8793238 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
class X86Subtarget : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
- NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
};
enum X863DNowEnum {
@@ -169,6 +169,15 @@ protected:
/// address generation (AG) time.
bool LEAUsesAG;
+ /// Processor has AVX-512 PreFetch Instructions
+ bool HasPFI;
+
+ /// Processor has AVX-512 Exponential and Reciprocal Instructions
+ bool HasERI;
+
+ /// Processor has AVX-512 Conflict Detection Instructions
+ bool HasCDI;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -249,6 +258,7 @@ public:
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512; }
bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
@@ -282,6 +292,9 @@ public:
bool padShortFunctions() const { return PadShortFunctions; }
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
+ bool hasCDI() const { return HasCDI; }
+ bool hasPFI() const { return HasPFI; }
+ bool hasERI() const { return HasERI; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
@@ -338,7 +351,13 @@ public:
}
bool isPICStyleStubAny() const {
return PICStyle == PICStyles::StubDynamicNoPIC ||
- PICStyle == PICStyles::StubPIC; }
+ PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ return (isTargetWin64() && CC != CallingConv::X86_64_SysV) ||
+ CC == CallingConv::X86_64_Win64;
+ }
/// ClassifyGlobalReference - Classify a global variable reference for the
/// current subtarget according to how we should reference it in a non-pcrel
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 0422a61..49ebd1a 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -132,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
// Add first the target-independent BasicTTI pass, then our X86 pass. This
// allows the X86 pass to delegate to the target independent layer when
// appropriate.
- PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+ PM.add(createBasicTargetTransformInfoPass(this));
PM.add(createX86TargetTransformInfoPass(this));
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 871dacd..a19c5a6 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
+
+const MCExpr *
+X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
+ const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 9d26d38..79c861d 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -36,6 +36,9 @@ namespace llvm {
/// and x86-64.
class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+ /// \brief Describe a TLS variable address within debug info.
+ virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index eba9d78..3bbddad 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &);
namespace {
class X86TTI : public ImmutablePass, public TargetTransformInfo {
- const X86TargetMachine *TM;
const X86Subtarget *ST;
const X86TargetLowering *TLI;
@@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
llvm_unreachable("This pass cannot be directly constructed");
}
X86TTI(const X86TargetMachine *TM)
- : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+ : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
TLI(TM->getTargetLowering()) {
initializeX86TTIPass(*PassRegistry::getPassRegistry());
}
@@ -101,6 +100,8 @@ public:
unsigned Alignment,
unsigned AddressSpace) const;
+ virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
+
/// @}
};
@@ -196,6 +197,16 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
{ ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
{ ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
};
// Look for AVX2 lowering tricks.
@@ -258,6 +269,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
{ ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
{ ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
+
+ // It is not a good idea to vectorize division. We have to scalarize it and
+ // in the process we will often end up having to spilling regular
+ // registers. The overhead of division is going to dominate most kernels
+ // anyways so try hard to prevent vectorization of division - it is
+ // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+ // to hide "20 cycles" for each lane.
+ { ISD::SDIV, MVT::v16i8, 16*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v16i8, 16*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2()) {
@@ -467,19 +493,22 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
};
if (ST->hasAVX2()) {
- int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * AVX2CostTbl[Idx].Cost;
}
if (ST->hasAVX()) {
- int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * AVX1CostTbl[Idx].Cost;
}
if (ST->hasSSE42()) {
- int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
+ ISD, MTy);
if (Idx != -1)
return LT.first * SSE42CostTbl[Idx].Cost;
}
@@ -511,8 +540,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}
+unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
+ bool Extract) const {
+ assert (Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const {
+ // Handle non power of two vectors such as <3 x float>
+ if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+ unsigned NumElem = VTy->getVectorNumElements();
+
+ // Handle a few common cases:
+ // <3 x float>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+ // Cost = 64 bit store + extract + 32 bit store.
+ return 3;
+
+ // <3 x double>
+ if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+ // Cost = 128 bit store + unpack + 64 bit store.
+ return 3;
+
+ // Assume that all other non power-of-two numbers are scalarized.
+ if (!isPowerOf2_32(NumElem)) {
+ unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
+ VTy->getScalarType(),
+ Alignment,
+ AddressSpace);
+ unsigned SplitCost = getScalarizationOverhead(Src,
+ Opcode == Instruction::Load,
+ Opcode==Instruction::Store);
+ return NumElem * Cost + SplitCost;
+ }
+ }
+
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -528,3 +600,16 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
return Cost;
}
+
+unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ unsigned NumVectorInstToHideOverhead = 10;
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+}
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0f77948..477f75a 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
}
static bool isYmmReg(unsigned Reg) {
- if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
- return true;
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
+}
- return false;
+static bool isZmmReg(unsigned Reg) {
+ return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
}
static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmReg(I->first) || isZmmReg(I->first))
return true;
return false;
}
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+ for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}