diff options
Diffstat (limited to 'lib/Target/X86')
54 files changed, 3371 insertions, 964 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 263eb5e..ad83d97 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" @@ -830,6 +831,18 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && (!Mem.Size || Mem.Size == 64) && getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; } + bool isMemVZ32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + bool isMemVZ64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + + bool isMem512() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 512); + } bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && @@ -890,6 +903,16 @@ struct X86Operand : public MCParsedAsmOperand { addMemOperands(Inst, N); } + void addMemVZ32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVZ64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem512Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 7cb71f0..7e20151 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -53,7 +53,7 @@ endif() add_llvm_target(X86CodeGen ${sources}) -add_dependencies(LLVMX86CodeGen intrinsics_gen) +add_dependencies(LLVMX86CodeGen 
X86CommonTableGen intrinsics_gen) add_subdirectory(AsmParser) add_subdirectory(Disassembler) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index ca71c4f..82af6fa 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -286,6 +286,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_XMM256: mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); return; + case TYPE_XMM512: + mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4))); + return; case TYPE_REL8: isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; @@ -443,6 +446,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, EA_BASES_64BIT REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY } } else { @@ -565,6 +569,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM64: case TYPE_XMM128: case TYPE_XMM256: + case TYPE_XMM512: case TYPE_DEBUGREG: case TYPE_CONTROLREG: return translateRMRegister(mcInst, insn); @@ -683,6 +688,15 @@ static bool translateInstruction(MCInst &mcInst, } mcInst.setOpcode(insn.instructionID); + // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 + // prefix bytes should be disassembled as xrelease and xacquire then set the + // opcode to those instead of the rep and repne opcodes. 
+ if (insn.xAcquireRelease) { + if(mcInst.getOpcode() == X86::REP_PREFIX) + mcInst.setOpcode(X86::XRELEASE_PREFIX); + else if(mcInst.getOpcode() == X86::REPNE_PREFIX) + mcInst.setOpcode(X86::XACQUIRE_PREFIX); + } int index; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index e40edba..bb195ee 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -328,6 +328,27 @@ static int readPrefixes(struct InternalInstruction* insn) { break; if (lookAtByte(insn, &nextByte)) return -1; + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) | + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = TRUE; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. 
+ */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = TRUE; if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { if (consumeByte(insn, &nextByte)) return -1; @@ -1234,6 +1255,8 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ case TYPE_XMM256: \ return prefix##_YMM0 + index; \ case TYPE_XMM128: \ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 407ead3..dcb6aad 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -219,7 +219,23 @@ extern "C" { ENTRY(XMM12) \ ENTRY(XMM13) \ ENTRY(XMM14) \ - ENTRY(XMM15) + ENTRY(XMM15) \ + ENTRY(XMM16) \ + ENTRY(XMM17) \ + ENTRY(XMM18) \ + ENTRY(XMM19) \ + ENTRY(XMM20) \ + ENTRY(XMM21) \ + ENTRY(XMM22) \ + ENTRY(XMM23) \ + ENTRY(XMM24) \ + ENTRY(XMM25) \ + ENTRY(XMM26) \ + ENTRY(XMM27) \ + ENTRY(XMM28) \ + ENTRY(XMM29) \ + ENTRY(XMM30) \ + ENTRY(XMM31) #define REGS_YMM \ ENTRY(YMM0) \ @@ -237,7 +253,57 @@ extern "C" { ENTRY(YMM12) \ ENTRY(YMM13) \ ENTRY(YMM14) \ - ENTRY(YMM15) + ENTRY(YMM15) \ + ENTRY(YMM16) \ + ENTRY(YMM17) \ + ENTRY(YMM18) \ + ENTRY(YMM19) \ + ENTRY(YMM20) \ + ENTRY(YMM21) \ + ENTRY(YMM22) \ + ENTRY(YMM23) \ + ENTRY(YMM24) \ + ENTRY(YMM25) \ + ENTRY(YMM26) \ + ENTRY(YMM27) \ + ENTRY(YMM28) \ + ENTRY(YMM29) \ + ENTRY(YMM30) \ + ENTRY(YMM31) + +#define REGS_ZMM \ + ENTRY(ZMM0) \ + ENTRY(ZMM1) \ + ENTRY(ZMM2) \ + ENTRY(ZMM3) \ + ENTRY(ZMM4) \ + ENTRY(ZMM5) \ + ENTRY(ZMM6) \ + ENTRY(ZMM7) \ + ENTRY(ZMM8) \ + ENTRY(ZMM9) \ + ENTRY(ZMM10) \ + ENTRY(ZMM11) \ + ENTRY(ZMM12) \ + ENTRY(ZMM13) \ + ENTRY(ZMM14) \ + ENTRY(ZMM15) \ + ENTRY(ZMM16) \ + ENTRY(ZMM17) \ + ENTRY(ZMM18) \ + ENTRY(ZMM19) \ + ENTRY(ZMM20) \ + ENTRY(ZMM21) \ + ENTRY(ZMM22) 
\ + ENTRY(ZMM23) \ + ENTRY(ZMM24) \ + ENTRY(ZMM25) \ + ENTRY(ZMM26) \ + ENTRY(ZMM27) \ + ENTRY(ZMM28) \ + ENTRY(ZMM29) \ + ENTRY(ZMM30) \ + ENTRY(ZMM31) #define REGS_SEGMENT \ ENTRY(ES) \ @@ -285,6 +351,7 @@ extern "C" { REGS_MMX \ REGS_XMM \ REGS_YMM \ + REGS_ZMM \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ @@ -319,6 +386,7 @@ typedef enum { ALL_EA_BASES REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY SIB_INDEX_max } SIBIndex; @@ -457,6 +525,8 @@ struct InternalInstruction { uint64_t necessaryPrefixLocation; /* The segment override type */ SegmentOverride segmentOverride; + /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */ + BOOL xAcquireRelease; /* Sizes of various critical pieces of data, in bytes */ uint8_t registerSize; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 23dfe4b..d291441 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -116,8 +116,106 @@ enum attributeBits { ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ - ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") - + ENUM_ENTRY(IC_VEX_L_W, 3, "requires VEX, L and W") \ + ENUM_ENTRY(IC_VEX_L_W_XS, 4, "requires VEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_VEX_L_W_XD, 4, "requires VEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 4, "requires VEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \ + ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \ 
+ ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L 
and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \ + ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires 
EVEX_B, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \ + 
ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, typedef enum { @@ -224,6 +322,7 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ ENUM_ENTRY(ENCODING_CW, "2-byte") \ ENUM_ENTRY(ENCODING_CD, "4-byte") \ @@ -321,6 +420,9 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM64, "8-byte") \ ENUM_ENTRY(TYPE_XMM128, "16-byte") \ ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK8, "8-bit") \ + ENUM_ENTRY(TYPE_VK16, "16-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index e357710..b9d0082 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Try to print any aliases first. 
if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); - + // Next always print the annotation. printAnnotation(OS, Annot); @@ -139,8 +139,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + O << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. @@ -159,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); - + if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); - + } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << markup("<imm:") @@ -177,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &IndexReg = MI->getOperand(Op+2); const MCOperand &DispSpec = MI->getOperand(Op+3); const MCOperand &SegReg = MI->getOperand(Op+4); - + O << markup("<mem:"); // If this has a segment register, print it. @@ -185,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, printOperand(MI, Op+4, O); O << ':'; } - + if (DispSpec.isImm()) { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) @@ -194,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); O << *DispSpec.getExpr(); } - + if (IndexReg.getReg() || BaseReg.getReg()) { O << '('; if (BaseReg.getReg()) printOperand(MI, Op, O); - + if (IndexReg.getReg()) { O << ','; printOperand(MI, Op+2, O); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); if (ScaleVal != 1) { O << ',' - << markup("<imm:") + << markup("<imm:") << ScaleVal // never printed in hex. 
- << markup(">"); + << markup(">"); } } O << ')'; diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 8e09183..8d05256 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -65,6 +65,9 @@ public: void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -80,6 +83,9 @@ public: void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } }; } diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 141f4a4..9dfc9a9 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -119,7 +119,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) - O << Op.getImm(); + O << formatImm(Op.getImm()); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print @@ -127,8 +127,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); + O << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. 
@@ -137,18 +136,13 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, } } -static void PrintRegName(raw_ostream &O, StringRef RegName) { - for (unsigned i = 0, e = RegName.size(); i != e; ++i) - O << (char)toupper(RegName[i]); -} - void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - PrintRegName(O, getRegisterName(Op.getReg())); + printRegName(O, Op.getReg()); } else if (Op.isImm()) { - O << Op.getImm(); + O << formatImm((int64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << *Op.getExpr(); @@ -200,7 +194,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, DispVal = -DispVal; } } - O << DispVal; + O << formatImm(DispVal); } } diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index bb769eb..45beeda 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -41,52 +41,60 @@ public: void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "OPAQUE PTR "; + O << "opaque ptr "; printMemReference(MI, OpNo, O); } void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "BYTE PTR "; + O << "byte ptr "; printMemReference(MI, OpNo, O); } void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "WORD PTR "; + O << "word ptr "; printMemReference(MI, OpNo, O); } void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword ptr "; printMemReference(MI, OpNo, O); } void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O 
<< "xmmword ptr "; printMemReference(MI, OpNo, O); } void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword ptr "; printMemReference(MI, OpNo, O); } void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XWORD PTR "; + O << "xword ptr "; printMemReference(MI, OpNo, O); } void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O << "xmmword ptr "; printMemReference(MI, OpNo, O); } void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } }; diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index d8f7278..25d1af3 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -462,20 +462,54 @@ namespace X86II { // prefix. Usually used for scalar instructions. Needed by disassembler. VEX_LIG = 1U << 6, + // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field + // with following encoding: + // - 00 V128 + // - 01 V256 + // - 10 V512 + // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. 
+ // this will save 1 tsflag bit + + // VEX_EVEX - Specifies that this instruction use EVEX form which provides + // syntax support up to 32 512-bit register operands and up to 7 16-bit + // mask operands as well as source operand data swizzling/memory operand + // conversion, eviction hint, and rounding mode. + EVEX = 1U << 7, + + // EVEX_K - Set if this instruction requires masking + EVEX_K = 1U << 8, + + // EVEX_Z - Set if this instruction has EVEX.Z field set. + EVEX_Z = 1U << 9, + + // EVEX_L2 - Set if this instruction has EVEX.L' field set. + EVEX_L2 = 1U << 10, + + // EVEX_B - Set if this instruction has EVEX.B field set. + EVEX_B = 1U << 11, + + // EVEX_CD8E - compressed disp8 form, element-size + EVEX_CD8EShift = VEXShift + 12, + EVEX_CD8EMask = 3, + + // EVEX_CD8V - compressed disp8 form, vector-width + EVEX_CD8VShift = EVEX_CD8EShift + 2, + EVEX_CD8VMask = 7, + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7, + Has3DNow0F0FOpcode = 1U << 17, /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. - MemOp4 = 1U << 8, + MemOp4 = 1U << 18, /// XOP - Opcode prefix used by XOP instructions. 
- XOP = 1U << 9 + XOP = 1U << 19 }; @@ -533,12 +567,19 @@ namespace X86II { unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb CurOp += 2; - } + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; return CurOp; } @@ -569,12 +610,15 @@ namespace X86II { case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). if (HasMemOp4) ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). - + if (HasEVEX_K) + ++FirstMemOp;// Skip the mask register // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -611,6 +655,14 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. 
inline bool isX86_64ExtendedReg(unsigned RegNo) { + if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) || + (RegNo > X86::XMM23 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM7 && RegNo <= X86::YMM15) || + (RegNo > X86::YMM23 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) || + (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31)) + return true; + switch (RegNo) { default: break; case X86::R8: case X86::R9: case X86::R10: case X86::R11: @@ -621,16 +673,21 @@ namespace X86II { case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: - case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: - case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: - case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: - case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: return true; } return false; } + + /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) + /// registers? e.g. zmm21, etc. 
+ static inline bool is32ExtendedReg(unsigned RegNo) { + return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM15 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); + } + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index de80dd8..b400b87 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -101,7 +101,18 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); - case FK_Data_8: Type = ELF::R_X86_64_64; break; + case FK_Data_8: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_64; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_X86_64_DTPOFF64; + break; + } + break; case X86::reloc_signed_4byte: switch (Modifier) { default: diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 016af71..8515879 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -53,7 +53,7 @@ public: } unsigned GetX86RegNum(const MCOperand &MO) const { - return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7; + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; } // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range @@ -77,6 +77,14 @@ public: return (~SrcRegNum) & 0xf; } + unsigned char getWriteMaskRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + assert(X86::K0 != MI.getOperand(OpNum).getReg() && + "Invalid mask register as write-mask!"); + unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum)); + return MaskRegNum; + } + void 
EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { OS << (char)C; ++CurByte; @@ -152,6 +160,52 @@ static bool isDisp8(int Value) { return Value == (signed char)Value; } +/// isCDisp8 - Return true if this signed displacement fits in a 8-bit +/// compressed dispacement field. +static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { + assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) && + "Compressed 8-bit displacement is only valid for EVEX inst."); + + unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask; + unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask; + + if (CD8V == 0 && CD8E == 0) { + CValue = Value; + return isDisp8(Value); + } + + unsigned MemObjSize = 1U << CD8E; + if (CD8V & 4) { + // Fixed vector length + MemObjSize *= 1U << (CD8V & 0x3); + } else { + // Modified vector length + bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B; + if (!EVEX_b) { + unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0; + EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0; + assert(EVEX_LL < 3 && ""); + + unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize; + NumElems /= 1U << (CD8V & 0x3); + + MemObjSize *= NumElems; + } + } + + unsigned MemObjMask = MemObjSize - 1; + assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size."); + + if (Value & MemObjMask) // Unaligned offset + return false; + Value /= MemObjSize; + bool Ret = (Value == (signed char)Value); + + if (Ret) + CValue = Value; + return Ret; +} + /// getImmFixupKind - Return the appropriate fixup kind to use for an immediate /// in an instruction with the specified TSFlags. 
static MCFixupKind getImmFixupKind(uint64_t TSFlags) { @@ -318,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; // Handle %rip relative addressing. if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode @@ -378,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - if (Disp.isImm() && isDisp8(Disp.getImm())) { - EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); - return; + if (Disp.isImm()) { + if (!HasEVEX && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // Try EVEX compressed 8-bit displacement first; if failed, fall back to + // 32-bit displacement. + int CDisp8 = 0; + if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + CDisp8 - Disp.getImm()); + return; + } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] @@ -397,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, bool ForceDisp32 = false; bool ForceDisp8 = false; + int CDisp8 = 0; + int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. 
@@ -412,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); - } else if (isDisp8(Disp.getImm())) { + } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); @@ -445,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Do we need to output a displacement? if (ForceDisp8) - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, Fixups); @@ -457,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; @@ -468,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0: Same as REX_R=1 (64 bit mode only) // unsigned char VEX_R = 0x1; + unsigned char EVEX_R2 = 0x1; // VEX_X: equivalent to REX.X, 
only used when a // register is used for index in SIB Byte. @@ -504,6 +580,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // VEX_4V (VEX vvvv field): a register specifier // (in 1's complement form) or 1111 if unused. unsigned char VEX_4V = 0xf; + unsigned char EVEX_V2 = 0x1; // VEX_L (Vector Length): // @@ -511,6 +588,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 1: 256-bit vector // unsigned char VEX_L = 0; + unsigned char EVEX_L2 = 0; // VEX_PP: opcode extension providing equivalent // functionality of a SIMD prefix @@ -522,6 +600,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char VEX_PP = 0; + // EVEX_U + unsigned char EVEX_U = 1; // Always '1' so far + + // EVEX_z + unsigned char EVEX_z = 0; + + // EVEX_b + unsigned char EVEX_b = 0; + + // EVEX_aaa + unsigned char EVEX_aaa = 0; + // Encode the operand size opcode prefix as needed. if (TSFlags & X86II::OpSize) VEX_PP = 0x01; @@ -534,6 +624,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2)) + EVEX_L2 = 1; + + if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z)) + EVEX_z = 1; + + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B)) + EVEX_b = 1; switch (TSFlags & X86II::Op0Mask) { default: llvm_unreachable("Invalid prefix!"); @@ -580,12 +678,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special 
case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb CurOp += 2; - } + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); @@ -595,18 +700,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MemAddr, src1(VEX_4V), src2(ModR/M) // MemAddr, src1(ModR/M), imm8 // - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrBaseReg).getReg())) VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + CurOp += X86::AddrNumOperands; - CurOp = X86::AddrNumOperands; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } const MCOperand &MO = MI.getOperand(CurOp); - if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) - VEX_R = 0x0; + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MO.getReg())) + EVEX_R2 = 0x0; + } break; } case X86II::MRMSrcMem: @@ -619,11 +741,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned 
&CurByte, // FMA4: // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); - if (HasVEX_4V) + if (HasVEX_4V) { VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -631,6 +763,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; if (HasVEX_4VOp3) // Instruction format for 4VOp3: @@ -647,8 +782,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MRM[0-9]m instructions forms: // MemAddr // src1(VEX_4V), MemAddr - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 0); + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + } + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -669,16 +811,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; 
CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (HasMemOp4) // Skip second register source (encoded in I8IMM) CurOp++; if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; if (HasVEX_4VOp3) VEX_4V = getVEXRegisterEncoding(MI, CurOp); @@ -690,13 +843,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M), src1(VEX_4V), src2(ModR/M) if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; break; case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -704,9 +868,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::MRM6r: case X86II::MRM7r: // MRM0r-MRM7r instructions forms: // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, 0); - if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = 
getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; break; default: // RawFrm break; @@ -715,29 +888,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Emit segment override opcode prefix as needed. EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); - // VEX opcode prefix can have 2 or 3 bytes - // - // 3 bytes: - // +-----+ +--------------+ +-------------------+ - // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - // 2 bytes: - // +-----+ +-------------------+ - // | C5h | | R | vvvv | L | pp | - // +-----+ +-------------------+ - // - unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + if (!HasEVEX) { + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix - EmitByte(0xC5, CurByte, OS); - EmitByte(LastByte | (VEX_R << 7), CurByte, OS); - return; - } + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } - // 3 byte VEX prefix - EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS); - EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + // 3 byte VEX prefix + EmitByte(XOP ? 
0x8F : 0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + } else { + // EVEX opcode prefix can have 4 bytes + // + // +-----+ +--------------+ +-------------------+ +------------------------+ + // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // +-----+ +--------------+ +-------------------+ +------------------------+ + assert((VEX_5M & 0x3) == VEX_5M + && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + VEX_5M &= 0x3; + + EmitByte(0x62, CurByte, OS); + EmitByte((VEX_R << 7) | + (VEX_X << 6) | + (VEX_B << 5) | + (EVEX_R2 << 4) | + VEX_5M, CurByte, OS); + EmitByte((VEX_W << 7) | + (VEX_4V << 3) | + (EVEX_U << 2) | + VEX_PP, CurByte, OS); + EmitByte((EVEX_z << 7) | + (EVEX_L2 << 6) | + (VEX_L << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + } } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -1007,6 +1209,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; const unsigned MemOp4_I8IMMOperand = 2; + // It uses the EVEX.aaa field? + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); + // Determine where the memory operand starts, if present. 
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; @@ -1057,6 +1263,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1069,6 +1278,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1082,6 +1294,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1100,6 +1315,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMSrcMem: { int AddrOperands = X86::AddrNumOperands; unsigned FirstMemOp = CurOp+1; + + if (HasEVEX_K) { // Skip writemask + ++AddrOperands; + ++FirstMemOp; + } + if (HasVEX_4V) { ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 496b704..adfa7fa 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -517,37 +517,6 @@ to <2 x i64> ops being so bad. //===---------------------------------------------------------------------===// -'select' on vectors and scalars could be a whole lot better. We currently -lower them to conditional branches. On x86-64 for example, we compile this: - -double test(double a, double b, double c, double d) { return a<b ? 
c : d; } - -to: - -_test: - ucomisd %xmm0, %xmm1 - ja LBB1_2 # entry -LBB1_1: # entry - movapd %xmm3, %xmm2 -LBB1_2: # entry - movapd %xmm2, %xmm0 - ret - -instead of: - -_test: - cmpltsd %xmm1, %xmm0 - andpd %xmm0, %xmm2 - andnpd %xmm3, %xmm0 - orpd %xmm2, %xmm0 - ret - -For unpredictable branches, the later is much more efficient. This should -just be a matter of having scalar sse map to SELECT_CC and custom expanding -or iseling it. - -//===---------------------------------------------------------------------===// - LLVM currently generates stack realignment code, when it is not necessary needed. The problem is that we need to know about stack alignment too early, before RA runs. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index c865500..461ea9b 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -86,6 +86,19 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512", + "Enable AVX-512 instructions", + [FeatureAVX2]>; +def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; + def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -227,6 +240,15 @@ def : ProcessorModel<"core-avx2", HaswellModel, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; +// KNL +// FIXME: define KNL model +def : ProcessorModel<"knl", HaswellModel, + [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, + FeatureCMPXCHG16B, 
FeatureFastUAMem, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; + def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; def : Proc<"k6-3", [Feature3DNow]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 6b228b0..9e0ab82 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -702,48 +702,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } -MachineLocation -X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - - if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - -void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &O) { - // Only the target-dependent form of DBG_VALUE should get here. - // Referencing the offset and metadata as NOps-2 and NOps-1 is - // probably portable to other targets; frame pointer location is not. - unsigned NOps = MI->getNumOperands(); - assert(NOps==7); - O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - if (V.getContext().isSubprogram()) - O << DISubprogram(V.getContext()).getDisplayName() << ":"; - O << V.getName(); - O << " <- "; - // Frame address. Currently handles register +- offset only. 
- O << '['; - if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) - printOperand(MI, 0, O); - else - O << "undef"; - O << '+'; printOperand(MI, 3, O); - O << ']'; - O << "+"; - printOperand(MI, NOps-2, O); -} - - - //===----------------------------------------------------------------------===// // Target Registry Stuff //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index bc7496b..6eed5ce 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -67,11 +67,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { unsigned AsmVariant = 1); virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - virtual MachineLocation - getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE; }; } // end namespace llvm diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 9eafbd5..38e2591 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. 
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, @@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // i32, i64 in the standard way CCDelegateTo<RetCC_X86Common> ]>; @@ -156,6 +166,11 @@ def RetCC_X86_32 : CallingConv<[ def RetCC_X86_64 : CallingConv<[ // HiPE uses RetCC_X86_64_HiPE CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + + // Handle explicit CC selection + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -208,10 +223,15 @@ def CC_X86_64_C : CallingConv<[ // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>>, + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, @@ -225,7 +245,11 @@ def CC_X86_64_C : CallingConv<[ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToStack<32, 32>> + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. 
+ CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> ]>; // Calling convention used on Win64 @@ -246,6 +270,9 @@ def CC_X86_Win64_C : CallingConv<[ // 256 bit vectors are passed by pointer CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType<i64>>, @@ -340,7 +367,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 4 AVX 256-bit vector arguments are passed in YMM registers. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. @@ -464,6 +491,10 @@ def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + // The 512-bit vector arguments are passed in ZMM registers. 
+ CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, CCDelegateTo<CC_X86_32_C> @@ -489,6 +520,8 @@ def CC_X86_32 : CallingConv<[ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -528,6 +561,10 @@ def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "YMM%u", 6, 15))>; +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; //Standard C + XMM 8-15 def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, (sequence "XMM%u", 8, 15))>; @@ -535,3 +572,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, //Standard C + YMM 8-15 def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 295a577..5bc3420 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -79,8 +79,10 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, + bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, + bool Aligned = 
false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -233,7 +235,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, + const X86AddressMode &AM, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -243,8 +246,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); - Val = AndResult; + TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; @@ -260,26 +263,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; break; case MVT::v4f32: - Opc = X86::MOVAPSmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - Opc = X86::MOVAPDmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - Opc = X86::MOVDQAmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; + else + Opc = Subtarget->hasAVX() ? 
X86::VMOVDQUmr : X86::MOVDQUmr; break; } addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DL, TII.get(Opc)), AM).addReg(Val); + DL, TII.get(Opc)), AM).addReg(ValReg); return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM) { + const X86AddressMode &AM, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); @@ -314,7 +326,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM); + return X86FastEmitStore(VT, ValReg, AM, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -688,6 +700,10 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; + unsigned SABIAlignment = + TD.getABITypeAlignment(S->getValueOperand()->getType()); + bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + MVT VT; if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) return false; @@ -696,7 +712,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (!X86SelectAddress(I->getOperand(1), AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM); + return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -712,10 +728,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall) + CC != CallingConv::X86_FastCall && + CC != CallingConv::X86_64_SysV) return false; - if (Subtarget->isTargetWin64()) + if (Subtarget->isCallingConvWin64(CC)) return false; // Don't handle popping bytes on return for now. @@ -1376,10 +1393,37 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Generate the DIV/IDIV instruction. 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); - // Copy output register into result register. - unsigned ResultReg = createResultReg(TypeEntry.RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. + // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. 
+ if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } UpdateValueMap(I, ResultReg); return true; @@ -1678,9 +1722,6 @@ bool X86FastISel::FastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; - if (Subtarget->isTargetWin64()) - return false; - const Function *F = FuncInfo.Fn; if (F->isVarArg()) return false; @@ -1688,7 +1729,10 @@ bool X86FastISel::FastLowerArguments() { CallingConv::ID CC = F->getCallingConv(); if (CC != CallingConv::C) return false; - + + if (Subtarget->isCallingConvWin64(CC)) + return false; + if (!Subtarget->is64Bit()) return false; @@ -1732,8 +1776,6 @@ bool X86FastISel::FastLowerArguments() { const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++Idx) { - if (I->use_empty()) - continue; bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; @@ -1792,8 +1834,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Handle only C and fastcc calling conventions for now. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); + bool isWin64 = Subtarget->isCallingConvWin64(CC); if (CC != CallingConv::C && CC != CallingConv::Fast && - CC != CallingConv::X86_FastCall) + CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 && + CC != CallingConv::X86_64_SysV) return false; // fastcc with -tailcallopt is intended to provide a guaranteed @@ -1807,7 +1851,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Don't know how to handle Win64 varargs yet. Nothing special needed for // x86-32. Special handling for x86-64 is implemented. 
- if (isVarArg && Subtarget->isTargetWin64()) + if (isVarArg && isWin64) return false; // Fast-isel doesn't know about callee-pop yet. @@ -1937,7 +1981,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) + if (isWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -2053,7 +2097,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { X86::EBX).addReg(Base); } - if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + if (Subtarget->is64Bit() && isVarArg && !isWin64) { // Count the number of XMM registers allocated. static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, @@ -2122,7 +2166,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); - if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + if (Subtarget->is64Bit() && isVarArg && !isWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. 
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 8522c8c..48470da 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -115,9 +115,10 @@ namespace { unsigned Mask = 0; for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I - X86::FP0; - if (Reg < 8) - Mask |= 1 << Reg; + unsigned Reg = *I; + if (Reg < X86::FP0 || Reg > X86::FP6) + continue; + Mask |= 1 << (Reg - X86::FP0); } return Mask; } @@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) .addExternalSymbol("_ftol2") .addReg(X86::ST0, RegState::ImplicitKill) + .addReg(X86::ECX, RegState::ImplicitDefine) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::EDX, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 3061117..b994e67 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -307,7 +307,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, unsigned FramePtr) const { MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. 
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); @@ -360,7 +360,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, if (HasFP && FramePtr == Reg) continue; - unsigned DwarfReg = MRI.getDwarfRegNum(Reg, true); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset)); } } @@ -914,11 +914,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - // MSVC x64's __chkstk needs to adjust %rsp. - // FIXME: %rax preserves the offset and should be available. - if (isSPUpdateNeeded) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, - UseLEA, TII, *RegInfo); + // MSVC x64's __chkstk does not adjust %rsp itself. + // It also does not clobber %rax so we can reuse it when adjusting %rsp. + if (isSPUpdateNeeded) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) + .addReg(StackPtr) + .addReg(X86::RAX) + .setMIFlag(MachineInstr::FrameSetup); + } if (isEAXAlive) { // Restore EAX @@ -1320,7 +1323,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area @@ -1333,7 +1336,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, - (-1U*SlotSize)+TailCallReturnAddrDelta, true); + TailCallReturnAddrDelta - SlotSize, true); } if (hasFP(MF)) { diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 4ffffa1..9465420 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ 
b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -141,10 +141,6 @@ namespace { /// SelectionDAG operations. /// class X86DAGToDAGISel : public SelectionDAGISel { - /// X86Lowering - This object fully describes how to lower LLVM code to an - /// X86-specific SelectionDAG. - const X86TargetLowering &X86Lowering; - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; @@ -156,7 +152,6 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), - X86Lowering(*tm.getTargetLowering()), Subtarget(&tm.getSubtarget<X86Subtarget>()), OptForSize(false) {} @@ -233,7 +228,8 @@ namespace { SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI->getPointerTy()) : + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, + getTargetLowering()->getPointerTy()) : AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; @@ -504,8 +500,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. 
- bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); - bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(getTargetLowering()); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; @@ -1556,7 +1554,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, + getTargetLowering()->getPointerTy()).getNode(); } SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { @@ -2532,6 +2531,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Prevent use of AH in a REX instruction by referencing AX instead. // Shift it down 8 bits. + // + // The current assumption of the register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. if (HiReg == X86::AH && Subtarget->is64Bit() && !SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 346dfbb..9ffe29f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -58,17 +58,14 @@ STATISTIC(NumTailCalls, "Number of tail calls"); static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 instruction or a -/// simple subregister reference. Idx is an index in the 128 bits we -/// want. 
It need not be aligned to a 128-bit bounday. That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); EVT VT = Vec.getValueType(); - assert(VT.is256BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); @@ -76,13 +73,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) * ElemsPerChunk); // If the input is a buildvector just emit a smaller one. @@ -95,38 +91,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, VecIdx); return Result; + +} +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit bounday. 
That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128 instruction or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +/// Generate a DAG to grab 256-bits from a 512-bit vector. +static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; - EVT VT = Vec.getValueType(); - assert(VT.is128BitVector() && "Unexpected vector size!"); - EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); - // Insert the relevant 128 bits. - unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. 
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. 
This is used because creating CONCAT_VECTOR nodes of @@ -139,6 +168,13 @@ static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); bool is64Bit = Subtarget->is64Bit(); @@ -563,10 +599,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); if (Subtarget->is64Bit()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); @@ -586,10 +618,12 @@ void X86TargetLowering::resetOperationActions() { // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit()) { + if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { + // TargetInfo::X86_64ABIBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { + // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } @@ -1000,7 +1034,7 @@ void X86TargetLowering::resetOperationActions() { setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } - if (Subtarget->hasSSE41()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 
setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -1263,6 +1297,150 @@ void X86TargetLowering::resetOperationActions() { } } + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v16f32, Legal); + setOperationAction(ISD::LOAD, MVT::v8f64, Legal); + setOperationAction(ISD::LOAD, MVT::v8i64, Legal); + setOperationAction(ISD::LOAD, MVT::v16i32, Legal); + setOperationAction(ISD::LOAD, MVT::v16i1, Legal); + + setOperationAction(ISD::FADD, MVT::v16f32, Legal); + setOperationAction(ISD::FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + + setOperationAction(ISD::FADD, MVT::v8f64, Legal); + setOperationAction(ISD::FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FMA, MVT::v8f64, Legal); + setOperationAction(ISD::FMA, MVT::v16f32, Legal); + setOperationAction(ISD::SDIV, MVT::v16i32, Custom); + + + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + 
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + + setOperationAction(ISD::TRUNCATE, MVT::i1, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + + setOperationAction(ISD::SETCC, MVT::v16i1, Custom); + setOperationAction(ISD::SETCC, MVT::v8i1, Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8f64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i64, Custom); + setOperationAction(ISD::SELECT, MVT::v16f32, Custom); + + setOperationAction(ISD::ADD, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v16i32, Legal); + + setOperationAction(ISD::SUB, MVT::v8i64, Legal); + setOperationAction(ISD::SUB, MVT::v16i32, Legal); + + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::SRL, MVT::v8i64, Custom); + setOperationAction(ISD::SRL, 
MVT::v16i32, Custom); + + setOperationAction(ISD::SHL, MVT::v8i64, Custom); + setOperationAction(ISD::SHL, MVT::v16i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i64, Custom); + setOperationAction(ISD::SRA, MVT::v16i32, Custom); + + setOperationAction(ISD::AND, MVT::v8i64, Legal); + setOperationAction(ISD::OR, MVT::v8i64, Legal); + setOperationAction(ISD::XOR, MVT::v8i64, Legal); + + // Custom lower several nodes. + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Extract subvector is special because the value type + // (result) is 256/128-bit but the source is 512-bit wide. + if (VT.is128BitVector() || VT.is256BitVector()) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + if (VT.getVectorElementType() == MVT::i1) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + // Do not attempt to custom lower other non-512-bit vectors + if (!VT.is512BitVector()) + continue; + + if (VT != MVT::v8i64) { + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + } + if ( EltSize >= 32) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } + } + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + // Do not attempt to promote non-256-bit vectors + if (!VT.is512BitVector()) + continue; + + setOperationAction(ISD::LOAD, VT, 
Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v8i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + } + }// has AVX-512 + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; @@ -1884,13 +2062,19 @@ static bool IsTailCallConvention(CallingConv::ID CC) { CC == CallingConv::HiPE); } +/// \brief Return true if the calling convention is a C calling convention. +static bool IsCCallConvention(CallingConv::ID CC) { + return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || + CC == CallingConv::X86_64_SysV); +} + bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; return true; @@ -1965,7 +2149,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWindows = Subtarget->isTargetWindows(); - bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); assert(!(isVarArg && IsTailCallConvention(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); @@ -1976,9 +2160,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 - if (IsWin64) { + if (IsWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeFormalArguments(Ins, CC_X86); @@ -2004,12 +2187,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; else if 
(RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2267,7 +2456,8 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), @@ -2279,10 +2469,10 @@ SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SelectionDAG &DAG = CLI.DAG; - SDLoc &dl = CLI.DL; - SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; - SmallVector<SDValue, 32> &OutVals = CLI.OutVals; - SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; @@ -2291,7 +2481,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); - bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); bool IsWindows = Subtarget->isTargetWindows(); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; @@ -2324,9 +2514,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ArgLocs, 
*DAG.getContext()); // Allocate shadow area for Win64 - if (IsWin64) { + if (IsWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(Outs, CC_X86); @@ -2837,13 +3026,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && - CalleeCC != CallingConv::C) + if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = DAG.getMachineFunction().getFunction(); + const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to @@ -2853,6 +3041,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) @@ -2886,7 +3076,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. 
- if (Subtarget->isTargetWin64()) + if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector<CCValAssign, 16> ArgLocs; @@ -2961,9 +3151,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { @@ -3135,7 +3324,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, + -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3696,12 +3886,10 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { @@ -3735,11 +3923,10 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = 
Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { @@ -3780,12 +3967,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; @@ -3815,11 +4000,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) @@ -4051,42 +4235,59 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { return true; } -/// isVEXTRACTF128Index - Return true if the specified +/// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for input to VEXTRACTF128. 
-bool X86::isVEXTRACTF128Index(SDNode *N) { +/// suitable for instructions that extract 128- or 256-bit vectors +static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) return false; - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getValueType(0).getSimpleVT(); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } -/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to -/// VINSERTF128. -bool X86::isVINSERTF128Index(SDNode *N) { +/// instructions that insert 128- or 256-bit subvectors +static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) return false; - - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. 
uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getValueType(0).getSimpleVT(); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } +bool X86::isVINSERT128Index(SDNode *N) { + return isVINSERTIndex(N, 128); +} + +bool X86::isVINSERT256Index(SDNode *N) { + return isVINSERTIndex(N, 256); +} + +bool X86::isVEXTRACT128Index(SDNode *N) { + return isVEXTRACTIndex(N, 128); +} + +bool X86::isVEXTRACT256Index(SDNode *N) { + return isVEXTRACTIndex(N, 256); +} + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. @@ -4190,12 +4391,10 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } -/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// instructions. 
-unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); + llvm_unreachable("Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4203,16 +4402,14 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } -/// getInsertVINSERTF128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// instructions. -unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERTF128"); + llvm_unreachable("Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4220,10 +4417,38 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { MVT VecVT = N->getValueType(0).getSimpleVT(); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } +/// getExtractVEXTRACT128Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 +/// and VEXTRACTI128 instructions. 
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 128); +} + +/// getExtractVEXTRACT256Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 +/// and VEXTRACTI64x4 instructions. +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 256); +} + +/// getInsertVINSERT128Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 +/// and VINSERTI128 instructions. +unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 128); +} + +/// getInsertVINSERT256Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4 +/// and VINSERTI64x4 instructions. +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 256); +} + /// getShuffleCLImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. /// Handles 256-bit. 
@@ -5063,10 +5288,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - SmallVector<int, 8> Mask; - for (unsigned i = 0; i != NumElems; ++i) - Mask.push_back(EltNo); - + SmallVector<int, 8> Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } @@ -5184,7 +5406,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getValueType().getSimpleVT(); SDLoc dl(Op); - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; @@ -5240,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. - if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) + + // AVX-512 has register version of the broadcast + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + Ld.getValueType().getSizeInBits() >= 32; + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && + !hasRegVer)) return SDValue(); break; } } - bool Is256 = VT.is256BitVector(); + bool IsGE256 = (VT.getSizeInBits() >= 256); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. 
On Sandybridge it is still better to load a constant vector @@ -5256,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5280,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && - (ScalarSize == 32 || (Is256 && ScalarSize == 64))) + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -5375,6 +5602,108 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { return NV; } +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
+SDValue +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && + "Unexpected type in LowerBUILD_VECTORvXi1!"); + + SDLoc dl(Op); + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(1, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + bool AllContants = true; + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() == ISD::UNDEF) + continue; + if (!isa<ConstantSDNode>(In)) { + AllContants = false; + break; + } + if (cast<ConstantSDNode>(In)->getZExtValue()) + Immediate |= (1ULL << idx); + } + + if (AllContants) { + SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, + DAG.getConstant(Immediate, MVT::i16)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, + DAG.getIntPtrConstant(0)); + } + + if (!isSplatVector(Op.getNode())) + llvm_unreachable("Unsupported predicate operation"); + + SDValue In = Op.getOperand(0); + SDValue EFLAGS, X86CC; + if (In.getOpcode() == ISD::SETCC) { + SDValue Op0 = In.getOperand(0); + SDValue Op1 = In.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get(); + bool isFP = Op1.getValueType().isFloatingPoint(); + unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + + assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation"); + + X86CC = DAG.getConstant(X86CCVal, 
MVT::i8); + EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + } else if (In.getOpcode() == X86ISD::SETCC) { + X86CC = In.getOperand(0); + EFLAGS = In.getOperand(1); + } else { + // The algorithm: + // Bit1 = In & 0x1 + // if (Bit1 != 0) + // ZF = 0 + // else + // ZF = 1 + // if (ZF == 0) + // res = allOnes ### CMOVNE -1, %res + // else + // res = allZero + MVT InVT = In.getValueType().getSimpleVT(); + SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); + EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); + X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); + } + + if (VT == MVT::v16i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i16); + SDValue Cst0 = DAG.getConstant(0, MVT::i16); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, + Cst0, Cst1, X86CC, EFLAGS); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + + if (VT == MVT::v8i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i32); + SDValue Cst0 = DAG.getConstant(0, MVT::i32); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, + Cst0, Cst1, X86CC, EFLAGS); + CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + llvm_unreachable("Unsupported predicate operation"); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -5383,6 +5712,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); + // Generate vectors for predicate vectors. 
+ if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd @@ -5713,19 +6046,22 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getValueType().getSimpleVT(); - assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); + assert((ResVT.is256BitVector() || + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); + if(ResVT.is256BitVector()) + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); - return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); - // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors + // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. return LowerAVXCONCAT_VECTORS(Op, DAG); } @@ -7195,6 +7531,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDValue(); @@ -7203,17 +7540,19 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. 
- if (VecVT.is256BitVector()) { - SDLoc dl(Op.getNode()); - unsigned NumElems = VecVT.getVectorNumElements(); + if (VecVT.is256BitVector() || VecVT.is512BitVector()) { SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector. Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + EVT EltVT = VecVT.getVectorElementType(); + + unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); - if (IdxVal >= NumElems/2) - IdxVal -= NumElems/2; + //if (IdxVal >= NumElems/2) + // IdxVal -= NumElems/2; + IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, MVT::i32)); } @@ -7227,7 +7566,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, } MVT VT = Op.getValueType().getSimpleVT(); - SDLoc dl(Op); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); @@ -7348,19 +7686,20 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // If this is a 256-bit vector result, first extract the 128-bit vector, // insert the element into the extracted half and then place it back. - if (VT.is256BitVector()) { + if (VT.is256BitVector() || VT.is512BitVector()) { if (!isa<ConstantSDNode>(N2)) return SDValue(); // Get the desired 128-bit vector half. - unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - bool Upper = IdxVal >= NumElems/2; + unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(Upper ? 
IdxVal-NumElems/2 : IdxVal, MVT::i32)); + DAG.getConstant(IdxIn128, MVT::i32)); // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); @@ -7393,9 +7732,10 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. + unsigned SizeFactor = OpVT.getSizeInBits()/128; EVT VT128 = EVT::getVectorVT(*Context, OpVT.getVectorElementType(), - OpVT.getVectorNumElements() / 2); + OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); @@ -7418,16 +7758,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // upper bits of a vector. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - SDLoc dl(Op.getNode()); - SDValue Vec = Op.getNode()->getOperand(0); - SDValue Idx = Op.getNode()->getOperand(1); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + EVT ResVT = Op.getValueType(); + EVT InVT = In.getValueType(); - if (Op.getNode()->getValueType(0).is128BitVector() && - Vec.getNode()->getValueType(0).is256BitVector() && + if (Subtarget->hasFp256()) { + if (ResVT.is128BitVector() && + (InVT.is256BitVector() || InVT.is512BitVector()) && isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Extract128BitVector(Vec, IdxVal, DAG, dl); + return Extract128BitVector(In, IdxVal, DAG, dl); + } + if (ResVT.is256BitVector() && InVT.is512BitVector() && + isa<ConstantSDNode>(Idx)) { + return Extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); @@ -7444,12 +7790,20 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = 
Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).is256BitVector() && + if ((Op.getNode()->getValueType(0).is256BitVector() || + Op.getNode()->getValueType(0).is512BitVector()) && SubVec.getNode()->getValueType(0).is128BitVector() && isa<ConstantSDNode>(Idx)) { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } + + if (Op.getNode()->getValueType(0).is512BitVector() && + SubVec.getNode()->getValueType(0).is256BitVector() && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + } } return SDValue(); } @@ -8100,7 +8454,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, LLVMContext *Context = DAG.getContext(); // Build some magic constants. - const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); @@ -8837,7 +9191,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { - SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); @@ -9236,6 +9590,51 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } +/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point +/// mask CMPs. 
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + return SSECC; +} + // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. 
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { @@ -9283,43 +9682,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif - unsigned SSECC; - bool Swap = false; - - // SSE Condition code mapping: - // 0 - EQ - // 1 - LT - // 2 - LE - // 3 - UNORD - // 4 - NEQ - // 5 - NLT - // 6 - NLE - // 7 - ORD - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETOEQ: - case ISD::SETEQ: SSECC = 0; break; - case ISD::SETOGT: - case ISD::SETGT: Swap = true; // Fallthrough - case ISD::SETLT: - case ISD::SETOLT: SSECC = 1; break; - case ISD::SETOGE: - case ISD::SETGE: Swap = true; // Fallthrough - case ISD::SETLE: - case ISD::SETOLE: SSECC = 2; break; - case ISD::SETUO: SSECC = 3; break; - case ISD::SETUNE: - case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; // Fallthrough - case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; // Fallthrough - case ISD::SETUGT: SSECC = 6; break; - case ISD::SETO: SSECC = 7; break; - case ISD::SETUEQ: - case ISD::SETONE: SSECC = 8; break; - } - if (Swap) - std::swap(Op0, Op1); + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { @@ -9351,8 +9714,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. 
unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false; - + bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; + switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; @@ -9366,6 +9729,23 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, case ISD::SETUGE: Swap = true; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } + + // Special case: Use min/max operations for SETULE/SETUGE + MVT VET = VT.getVectorElementType(); + bool hasMinMax = + (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget->hasSSE2() && (VET == MVT::i8)); + + if (hasMinMax) { + switch (SetCCOpcode) { + default: break; + case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; + } + + if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + } + if (Swap) std::swap(Op0, Op1); @@ -9399,8 +9779,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. - const int MaskHi[] = { 1, 1, 3, 3 }; - const int MaskLo[] = { 0, 0, 2, 2 }; + static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); @@ -9427,7 +9807,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. 
- const int Mask[] = { 1, 0, 3, 2 }; + static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); @@ -9452,6 +9832,9 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); + + if (MinMax) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); return Result; } @@ -9560,8 +9943,30 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); + EVT VT = Op1.getValueType(); SDValue CC; + // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available. Otherwise fp cmovs get lowered into a less efficient branch + // sequence later on. + if (Cond.getOpcode() == ISD::SETCC && + ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget->hasSSE1() && VT == MVT::f32)) && + VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); + int SSECC = translateX86FSETCC( + cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + + if (SSECC != 8) { + unsigned Opcode = VT == MVT::f32 ? 
X86ISD::FSETCCss : X86ISD::FSETCCsd; + SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, + DAG.getConstant(SSECC, MVT::i8)); + SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); + SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); + return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); + } + } + if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) @@ -10889,8 +11294,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { X86CC = X86::COND_E; break; } - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, @@ -10907,8 +11311,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { else Opcode = X86ISD::PCMPESTRI; - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } @@ -11492,7 +11895,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, "Should not custom lower when pmuldq is available!"); // Extract the odd parts. - const int UnpackMask[] = { 1, -1, 3, -1 }; + static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); @@ -11506,7 +11909,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. 
- const int ShufMask[] = { 0, 4, 2, 6 }; + static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } @@ -11560,9 +11963,11 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); APInt SplatValue, SplatUndef; - unsigned MinSplatBits; + unsigned SplatBitSize; bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs) || + EltTy.getSizeInBits() < SplatBitSize) return SDValue(); if ((SplatValue != 0) && @@ -12706,6 +13111,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FSRL: return "X86ISD::FSRL"; @@ -12829,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -12917,6 +13324,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return NumBits1 > NumBits2; } +bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. 
+ return true; +} + bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } @@ -12968,6 +13389,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool +X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) + return false; + + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); @@ -14434,12 +14876,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, } else { // __chkstk(MSVCRT): does not update stack pointer. // Clobbers R10, R11 and EFLAGS. - // FIXME: RAX(allocated size) might be reused and not killed. BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) .addExternalSymbol("__chkstk") .addReg(X86::RAX, RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to subtracted from RSP. + // RAX has the offset to be subtracted from RSP. BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(X86::RAX); @@ -16299,6 +16740,38 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// \brief Returns a vector of 0s if the node in input is a vector logical +/// shift by a constant amount which is known to be bigger than or equal +/// to the vector element size in bits. 
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasInt256() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) + return SDValue(); + + SDValue Amt = N->getOperand(1); + SDLoc DL(N); + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + APInt ShiftAmt = C->getAPIntValue(); + unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + + // SSE2/AVX2 logical shifts always return a vector of 0s + // if the shift amount is bigger than or equal to + // the element size. The constant shift amount will be + // encoded as a 8-bit immediate. + if (ShiftAmt.trunc(8).uge(MaxAmount)) + return getZeroVector(VT, Subtarget, DAG, DL); + } + } + + return SDValue(); +} + /// PerformShiftCombine - Combine shifts. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16308,6 +16781,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (V.getNode()) return V; } + if (N->getOpcode() != ISD::SRA) { + // Try to fold this logical shift into a zero vector. 
+ SDValue V = performShiftToAllZeros(N, DAG, Subtarget); + if (V.getNode()) return V; + } + return SDValue(); } @@ -17253,7 +17732,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; - EVT VT = LHS.getValueType(); + MVT VT = LHS.getValueType().getSimpleVT(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); @@ -17324,23 +17803,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != NumElts; ++i) { - int LIdx = LMask[i], RIdx = RMask[i]; - - // Ignore any UNDEF components. - if (LIdx < 0 || RIdx < 0 || - (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || - (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) - continue; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + int LIdx = LMask[i+l], RIdx = RMask[i+l]; + + // Ignore any UNDEF components. + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs - unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; - int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. 
+ unsigned Src = (i/HalfLaneElts); // each lane is split between srcs + int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. @@ -17428,6 +17908,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { + // FANDN(x, 0.0) -> 0.0 + // FANDN(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + return SDValue(); +} + static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { @@ -17882,6 +18375,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -18423,7 +18917,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass*> X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. 
if (Constraint.size() == 1) { @@ -18490,7 +18984,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget->hasSSE1()) break; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: @@ -18515,6 +19009,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v8f32: case MVT::v4f64: return std::make_pair(0U, &X86::VR256RegClass); + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + return std::make_pair(0U, &X86::VR512RegClass); } break; } @@ -18625,7 +19124,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } else if (Res.second == &X86::FR32RegClass || Res.second == &X86::FR64RegClass || - Res.second == &X86::VR128RegClass) { + Res.second == &X86::VR128RegClass || + Res.second == &X86::VR256RegClass || + Res.second == &X86::FR32XRegClass || + Res.second == &X86::FR64XRegClass || + Res.second == &X86::VR128XRegClass || + Res.second == &X86::VR256XRegClass || + Res.second == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -18639,6 +19144,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.second = &X86::VR128RegClass; else if (X86::VR256RegClass.hasType(VT)) Res.second = &X86::VR256RegClass; + else if (X86::VR512RegClass.hasType(VT)) + Res.second = &X86::VR512RegClass; } return Res; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index c0e1015..2703274 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -53,6 +53,10 @@ namespace llvm { /// to X86::XORPS or X86::XORPD. 
FXOR, + /// FANDN - Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + /// FSRL - Bitwise logical right shift of floating point values. These /// corresponds to X86::PSRLDQ. FSRL, @@ -290,6 +294,10 @@ namespace llvm { // TESTP - Vector packed fp sign bitwise comparisons TESTP, + // OR/AND test for masks + KORTEST, + KTEST, + // Several flavors of instructions with vector shuffle behaviors. PALIGNR, PSHUFD, @@ -313,6 +321,8 @@ namespace llvm { VPERMI, VPERM2X128, VBROADCAST, + // masked broadcast + VBROADCASTM, // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, @@ -434,25 +444,45 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isVEXTRACTF128Index - Return true if the specified + /// isVEXTRACT128Index - Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. + bool isVEXTRACT128Index(SDNode *N); + + /// isVINSERT128Index - Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128, VINSERTI128 instructions. + bool isVINSERT128Index(SDNode *N); + + /// isVEXTRACT256Index - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is - /// suitable for input to VEXTRACTF128. - bool isVEXTRACTF128Index(SDNode *N); + /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. + bool isVEXTRACT256Index(SDNode *N); - /// isVINSERTF128Index - Return true if the specified + /// isVINSERT256Index - Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is - /// suitable for input to VINSERTF128. - bool isVINSERTF128Index(SDNode *N); + /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. 
+ bool isVINSERT256Index(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate + /// getExtractVEXTRACT128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index - /// with VEXTRACTF128 instructions. - unsigned getExtractVEXTRACTF128Immediate(SDNode *N); + /// with VEXTRACTF128, VEXTRACTI128 instructions. + unsigned getExtractVEXTRACT128Immediate(SDNode *N); - /// getInsertVINSERTF128Immediate - Return the appropriate + /// getInsertVINSERT128Immediate - Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index - /// with VINSERTF128 instructions. - unsigned getInsertVINSERTF128Immediate(SDNode *N); + /// with VINSERTF128, VINSERTI128 instructions. + unsigned getInsertVINSERT128Immediate(SDNode *N); + + /// getExtractVEXTRACT256Immediate - Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. + unsigned getExtractVEXTRACT256Immediate(SDNode *N); + + /// getInsertVINSERT256Immediate - Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF64x4, VINSERTI64x4 instructions. + unsigned getInsertVINSERT256Immediate(SDNode *N); /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. @@ -610,7 +640,7 @@ namespace llvm { /// error, this returns a register number of 0. std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + MVT VT) const; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. 
@@ -634,6 +664,8 @@ namespace llvm { virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const; + /// isZExtFree - Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in @@ -646,11 +678,11 @@ namespace llvm { virtual bool isZExtFree(EVT VT1, EVT VT2) const; virtual bool isZExtFree(SDValue Val, EVT VT2) const; - /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than - /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to - /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd - /// is expanded to mul + add. - virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; /// isNarrowingProfitable - Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. 
on x86, it's profitable to narrow @@ -802,6 +834,7 @@ namespace llvm { SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; @@ -821,7 +854,9 @@ namespace llvm { SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const; SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td new file mode 100644 index 0000000..8abae14 --- /dev/null +++ b/lib/Target/X86/X86InstrAVX512.td @@ -0,0 +1,715 @@ +// Bitcasts between 512-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX512] in { + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + + def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 
VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>; + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion + def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : 
Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// +// -- 32x4 fp form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; +} + +// -- 64x4 fp form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in { +def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} +// -- 32x4 integer form -- +let neverHasSideEffects = 1 in { +def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, 
(outs VR512:$dst), + (ins VR512:$src1, i128mem:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +} + +let neverHasSideEffects = 1 in { +// -- 64x4 form -- +def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + 
(INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +// 256-bit subvector inserts: the match fragment and the immediate transform must both be the vinsert256 variants. +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), + (bc_v8i32 (loadv4i64 addr:$src2)), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +// vinsertps - insert f32 to XMM +def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V; 
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +// -- 32x4 form -- +def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), + (ins f256mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +let neverHasSideEffects = 1 in { +// -- 32x4 form -- +def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), + (ins VR512:$src1, i8imm:$src2), + 
"vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), + (ins i256mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v4f32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v2f64 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v2i64 (VEXTRACTI32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + + +def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v8f32 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), + (v8i32 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v4f64 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v4i64 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +// A 256-bit subvector extract from the first 512-bit vector position +// is a subregister copy that needs no instruction. 
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; +def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; +def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; +def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; + +// zmm -> xmm +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; + + +// A 128-bit subvector insert to the first 512-bit vector position +// is a subregister copy that needs no instruction. 
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; + +def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX; + +def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, EVEX; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- +multiclass avx512_fp_broadcast<bits<8> opc, 
string OpcodeStr, + RegisterClass DestRC, + RegisterClass SrcRC, X86MemOperand x86memop> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX; +} +let ExeDomain = SSEPackedSingle in { + defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512, + VR128X, f32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +} + +let ExeDomain = SSEPackedDouble in { + defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512, + VR128X, f64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSZrm addr:$src)>; +def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDZrm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, + RegisterClass SrcRC, RegisterClass KRC> { + def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V512; + def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), + (ins KRC:$mask, SrcRC:$src), + !strconcat(OpcodeStr, + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + []>, EVEX, EVEX_V512, EVEX_KZ; +} + +defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; +defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, + VEX_W; + +def : Pat <(v16i32 (X86vzext VK16WM:$mask)), + (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + +def : Pat <(v8i64 (X86vzext VK8WM:$mask)), + (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + +def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), + (VPBROADCASTDrZrr GR32:$src)>; +def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), + (VPBROADCASTQrZrr GR64:$src)>; + +multiclass avx512_int_broadcast_rm<bits<8> opc, string 
OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, + RegisterClass KRC> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; + def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, + VR128X:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, + EVEX, EVEX_KZ; + def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; + def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, + (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; +} + +defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, + loadi32, VR512, v16i32, v4i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, + loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VT1>; + +def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), + (VBROADCASTSSZrr VR128X:$src)>; +def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), + (VBROADCASTSDZrr VR128X:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), + (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; +def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), + (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + +let Predicates = [HasAVX512] in { +def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), + (EXTRACT_SUBREG + (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + addr:$src)), sub_ymm)>; +} +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + RegisterClass DstRC, RegisterClass KRC, + ValueType OpVT, ValueType SrcVT> { +def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, + VK16, v16i32, v16i1>, EVEX_V512; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, + VK8, v8i64, v8i1>, EVEX_V512, VEX_W; + +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vt, X86MemOperand x86memop> { + let neverHasSideEffects = 1 in { + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + let mayLoad = 1 in + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vt (load addr:$src)))]>; + let mayStore = 1 in + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string 
OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let neverHasSideEffects = 1 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +let Predicates = [HasAVX512] in { + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + VEX, TB; + defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, TB; +} + +let Predicates = [HasAVX512] in { + // GR16 from/to 16-bit mask + def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; + def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; + + // Store kreg in memory + def : Pat<(store (v16i1 VK16:$src), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + + def : Pat<(store (v8i1 VK8:$src), addr:$dst), + (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>; +} +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
+let Predicates = [HasAVX512] in { + // GR from/to 8-bit mask without native support + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VK8)>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>; +} + +multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; + +def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; + +// With AVX-512, 8-bit mask is promoted to 16-bit mask. 
+def : Pat<(not VK8:$src), + (COPY_TO_REGCLASS + (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; + +// Mask binary operation +// - KADD, KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX_4V, VEX_L, TB; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; + +let isCommutable = 1 in { + defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>; + defm KAND : avx512_mask_binop_w<0x41, "kand", and>; + let isCommutable = 0 in + defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; + defm KOR : avx512_mask_binop_w<0x45, "kor", or>; + defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>; + defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; +} + +multiclass avx512_mask_binop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK16:$src1, VK16:$src2), + (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>; +} + +defm : avx512_mask_binop_int<"kadd", "KADD">; +defm : avx512_mask_binop_int<"kand", "KAND">; +defm : avx512_mask_binop_int<"kandn", "KANDN">; +defm : avx512_mask_binop_int<"kor", "KOR">; +defm : avx512_mask_binop_int<"kxnor", "KXNOR">; +defm : avx512_mask_binop_int<"kxor", "KXOR">; +// With AVX-512, 8-bit mask is promoted to 16-bit mask. 
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { + let Predicates = [HasAVX512] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; +} + +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<andn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; + +// Mask unpacking +multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, + RegisterClass KRC1, RegisterClass KRC2> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { + defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>, + VEX_4V, VEX_L, OpSize, TB; +} + +defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; + +multiclass avx512_mask_unpck_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK8:$src1, VK8:$src2), + (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>; +} + +defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>; + +// Mask shift 
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, OpSize, TA, VEX_W; +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>; + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm B : avx512_mask_setop<VK8, v8i1, Val>; + defm W : avx512_mask_setop<VK16, v16i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
+let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; +} +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; + +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), + (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index fa2b2d8..9ce02ba 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1041,13 +1041,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; + "{$src, %rax|rax, $src}">; } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -1112,13 +1112,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // Uses = [EFLAGS], Defs = [EFLAGS] def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; + "{$src, %rax|rax, $src}">; } /// 
ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1179,13 +1179,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; + "{$src, %rax|rax, $src}">; } @@ -1253,13 +1253,13 @@ let isCompare = 1 in { } // Defs = [EFLAGS] def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|RAX, $src}">; + "{$src, %rax|rax, $src}">; } // isCompare //===----------------------------------------------------------------------===// @@ -1302,12 +1302,12 @@ let neverHasSideEffects = 1 in { let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>; } } @@ -1336,7 +1336,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 
"adcx{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize; - + def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; @@ -1361,7 +1361,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; - + def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 8a7ee7d..8969946 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), // The MSVC runtime contains an _ftol2 routine for converting floating-point // to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. No -// other registers (aside from flags) are touched. +// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is +// used as a temporary register. No other registers (aside from flags) are +// touched. // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 // variant is unnecessary. 
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { +let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), "# win32 fptoui", [(X86WinFTOL RFP32:$src)]>, diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 2224a08..7c37888 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm> // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; -def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">; def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; -def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">; def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; -def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; -def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">; def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; -def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; -def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">; +def DIVR_FrST0 : FPrST0Inst 
<0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; @@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovb\t{$op, %st(0)|st(0), $op}">, DA; def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA; def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmove\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmove\t{$op, %st(0)|st(0), $op}">, DA; def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA; + "fcmovu\t{$op, %st(0)|st(0), $op}">, DA; def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovne\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB; } // Predicates = [HasCMov] // Floating point loads & stores. 
@@ -578,7 +578,7 @@ def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), let SchedRW = [WriteALU] in { let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags - (outs), (ins), "fnstsw %ax", + (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF; def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 1432414..64018b3 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -96,6 +96,20 @@ def SSEPackedSingle : Domain<1>; def SSEPackedDouble : Domain<2>; def SSEPackedInt : Domain<3>; +// Class specifying the vector form of the decompressed +// displacement of 8-bit. +class CD8VForm<bits<3> val> { + bits<3> Value = val; +} +def CD8VF : CD8VForm<0>; // v := VL +def CD8VH : CD8VForm<1>; // v := VL/2 +def CD8VQ : CD8VForm<2>; // v := VL/4 +def CD8VO : CD8VForm<3>; // v := VL/8 +def CD8VT1 : CD8VForm<4>; // v := 1 +def CD8VT2 : CD8VForm<5>; // v := 2 +def CD8VT4 : CD8VForm<6>; // v := 4 +def CD8VT8 : CD8VForm<7>; // v := 8 + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. 
class OpSize { bit hasOpSizePrefix = 1; } @@ -132,6 +146,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; } class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX : VEX { bit hasEVEXPrefix = 1; } +class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_CD8<int esize, CD8VForm form> { + bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00, + !if(!eq(esize, 16), 0b01, + !if(!eq(esize, 32), 0b10, + !if(!eq(esize, 64), 0b11, ?)))); + bits<3> EVEX_CD8V = form.Value; +} class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { bit hasXOP_Prefix = 1; } @@ -177,6 +204,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // to be encoded in a immediate field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit hasEVEXPrefix = 0; // Does this inst require EVEX form? + bit hasEVEX_K = 0; // Does this inst require masking? + bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? + bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? + bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? + bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size. + bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? 
@@ -200,9 +234,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{37} = hasVEX_i8ImmReg; let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; - let TSFlags{40} = has3DNow0F0FOpcode; - let TSFlags{41} = hasMemOp4Prefix; - let TSFlags{42} = hasXOP_Prefix; + let TSFlags{40} = hasEVEXPrefix; + let TSFlags{41} = hasEVEX_K; + let TSFlags{42} = hasEVEX_Z; + let TSFlags{43} = hasEVEX_L2; + let TSFlags{44} = hasEVEX_B; + let TSFlags{46-45} = EVEX_CD8E; + let TSFlags{49-47} = EVEX_CD8V; + let TSFlags{50} = has3DNow0F0FOpcode; + let TSFlags{51} = hasMemOp4Prefix; + let TSFlags{52} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -553,6 +594,74 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasAVX2]>; + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8 and OpSize prefix. +// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed. +// AVX512PSI - AVX-512 instructions with TB, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. +// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain. +// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes. 
+ +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX512]>; +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX512]>; +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, 
string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, + OpSize, Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8, + OpSize, EVEX_4V, Requires<[HasAVX512]>; + // AES Instruction Templates: // // AES8I @@ -628,6 +737,13 @@ class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 2a72fb6..0b51521 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -47,6 +47,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", 
SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; @@ -136,6 +138,8 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, @@ -153,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; + def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; @@ -192,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; @@ -405,28 +412,54 @@ def BYTE_imm : SDNodeXForm<imm, [{ return getI32Imm(N->getZExtValue() >> 3); }]>; -// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index -// to VEXTRACTF128 imm. -def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. 
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT128Immediate(N)); }]>; -// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to -// VINSERTF128 imm. -def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. +def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N)); }]>; -def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. 
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT256Immediate(N)); +}]>; + +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT128Index(N); +}], EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT128Index(N); +}], INSERT_get_vinsert128_imm>; + + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, node:$index), [{ - return X86::isVEXTRACTF128Index(N); -}], EXTRACT_get_vextractf128_imm>; + return X86::isVEXTRACT256Index(N); +}], EXTRACT_get_vextract256_imm>; -def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), (insert_subvector node:$bigvec, node:$smallvec, node:$index), [{ - return X86::isVINSERTF128Index(N); -}], INSERT_get_vinsertf128_imm>; + return X86::isVINSERT256Index(N); +}], INSERT_get_vinsert256_imm>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index df7b721..0443a93 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3733,19 +3733,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -MachineInstr* -X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - X86AddressMode AM; - AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = FrameIx; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE)); - addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr); - return &*MIB; -} - static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, const SmallVectorImpl<MachineOperand> &MOs, 
MachineInstr *MI, @@ -4660,6 +4647,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. + if (!TM.getSubtarget<X86Subtarget>().hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + switch(Second->getOpcode()) { + default: + return false; + case X86::JE_4: + case X86::JNE_4: + case X86::JL_4: + case X86::JLE_4: + case X86::JG_4: + case X86::JGE_4: + FuseKind = FuseInc; + break; + case X86::JB_4: + case X86::JBE_4: + case X86::JA_4: + case X86::JAE_4: + FuseKind = FuseCmp; + break; + case X86::JS_4: + case X86::JNS_4: + case X86::JP_4: + case X86::JNP_4: + case X86::JO_4: + case X86::JNO_4: + FuseKind = FuseTest; + break; + } + switch (First->getOpcode()) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case 
X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64_16r: + case X86::INC64_32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64_16r: + case X86::DEC64_32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + } +} bool X86InstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 332874f..a0d1ba7 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -275,12 +275,6 @@ public: virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; - virtual - MachineInstr 
*emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const; - /// foldMemoryOperand - If this target supports it, fold a load or store of /// the specified stack slot into the specified machine instruction for the /// specified operand(s). If this is possible, the target should perform the @@ -345,6 +339,9 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const; + virtual bool shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const LLVM_OVERRIDE; + virtual void getNoopForMachoTarget(MCInst &NopInst) const; virtual diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 817bd6c..0960a2a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -317,6 +317,16 @@ def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; } +def X86MemVZ64Operand : AsmOperandClass { + let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64"; +} +def X86MemVZ32Operand : AsmOperandClass { + let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32"; +} +def X86Mem512AsmOperand : AsmOperandClass { + let Name = "Mem512"; let PredicateMethod = "isMem512"; +} + def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; @@ -345,6 +355,8 @@ def i128mem : X86MemOperand<"printi128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def i256mem : X86MemOperand<"printi256mem"> { let ParserMatchClass = X86Mem256AsmOperand; } +def i512mem : X86MemOperand<"printi512mem"> { + let ParserMatchClass = X86Mem512AsmOperand; } def f32mem : X86MemOperand<"printf32mem"> { let ParserMatchClass = X86Mem32AsmOperand; } def f64mem : X86MemOperand<"printf64mem"> { @@ -355,6 +367,12 @@ def f128mem : X86MemOperand<"printf128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def f256mem : X86MemOperand<"printf256mem">{ let ParserMatchClass = X86Mem256AsmOperand; } +def f512mem : 
X86MemOperand<"printf512mem">{ + let ParserMatchClass = X86Mem512AsmOperand; } +def v512mem : Operand<iPTR> { + let PrintMethod = "printf512mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86Mem512AsmOperand; } // Gather mem operands def vx32mem : X86MemOperand<"printi32mem">{ @@ -369,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{ def vy64mem : X86MemOperand<"printi64mem">{ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); let ParserMatchClass = X86MemVY64Operand; } +def vy64xmem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } +def vz32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ32Operand; } +def vz64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ64Operand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -590,11 +617,19 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasEMI : Predicate<"Subtarget->hasERI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; +def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && 
!Subtarget->hasAVX512()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; @@ -803,11 +838,11 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG>, OpSize; -def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [], +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], IIC_POP_MEM>, OpSize; def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; -def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [], +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>; def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize; @@ -851,7 +886,7 @@ def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; -def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], IIC_POP_MEM>; } // mayLoad, SchedRW let mayStore = 1, SchedRW = [WriteStore] in { @@ -1041,40 +1076,52 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), /// 32-bit offset from the PC. These are only valid in x86-32 mode. 
let SchedRW = [WriteALU] in { def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>, + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>, + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>, + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>, + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; } -// FIXME: These definitions are utterly broken -// Just leave them commented out for now because they're useless outside -// of the large code model, and most compilers won't generate the instructions -// in question. 
-/* -def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -*/ - +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. +def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, + Requires<[In64BitMode]>; + +def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, + Requires<[In64BitMode]>; +def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, + Requires<[In64BitMode]>; +def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, + Requires<[In64BitMode]>; let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), @@ -1407,17 +1454,17 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), // Swap between EAX and other registers. 
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), - "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; + "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In32BitMode]>; // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In64BitMode]>; def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), - "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; + "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; } // SchedRW let SchedRW = [WriteALU] in { @@ -1814,6 +1861,7 @@ include "X86InstrXOP.td" // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" +include "X86InstrAVX512.td" include "X86InstrMMX.td" include "X86Instr3DNow.td" @@ -1921,29 +1969,31 @@ def : MnemonicAlias<"fucomip", "fucompi", "att">; def : MnemonicAlias<"fwait", "wait", "att">; -class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond> +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), - !strconcat(Prefix, NewCond, Suffix)>; + !strconcat(Prefix, NewCond, Suffix), VariantName>; /// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of /// MnemonicAlias's that canonicalize the condition code in a mnemonic, for /// example "setz" -> "sete". 
-multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> { - def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb - def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete - def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe - def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae - def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae - def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle - def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge - def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne - def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp - def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp - - def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb - def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta - def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl - def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg +multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : 
CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg } // Aliases for set<CC> @@ -1951,9 +2001,11 @@ defm : IntegerCondCodeMnemonicAlias<"set", "">; // Aliases for j<CC> defm : IntegerCondCodeMnemonicAlias<"j", "">; // Aliases for cmov<CC>{w,l,q} -defm : IntegerCondCodeMnemonicAlias<"cmov", "w">; -defm : IntegerCondCodeMnemonicAlias<"cmov", "l">; -defm : IntegerCondCodeMnemonicAlias<"cmov", "q">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// @@ -1965,75 +2017,83 @@ def : InstAlias<"aad", (AAD8i8 10)>; def : InstAlias<"aam", (AAM8i8 10)>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. -def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>; +// Likewise for btc/btr/bts. +def : InstAlias<"bt {$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btc {$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btr {$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"bts {$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. 
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>; +def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // div and idiv aliases for explicit A register. -def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>; -def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>; -def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>; -def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>; -def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>; -def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>; -def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>; -def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>; -def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>; -def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>; -def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>; -def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>; -def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>; -def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>; -def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>; -def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def 
: InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; -def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; -def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; -def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; -def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; -def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; -def : InstAlias<"fxch", (XCH_F ST1)>; -def : InstAlias<"fcom", (COM_FST0r ST1)>; -def : InstAlias<"fcomp", (COMP_FST0r ST1)>; -def : InstAlias<"fcomi", (COM_FIr ST1)>; -def : InstAlias<"fcompi", (COM_FIPr ST1)>; -def : InstAlias<"fucom", (UCOM_Fr ST1)>; -def : InstAlias<"fucomp", (UCOM_FPr ST1)>; -def : InstAlias<"fucomi", (UCOM_FIr ST1)>; -def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr 
ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; // Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), (Inst RST:$op), EmitAlias>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; -defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; -defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; defm : FpUnaryAlias<"fmul", MUL_FST0r>; defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; defm : FpUnaryAlias<"fdiv", DIV_FST0r>; -defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; -defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; @@ -2043,16 +2103,16 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. 
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; -def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; -def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; -def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; -def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; -def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. -def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; -def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. 
This seems to be an odd mapping in 64-bit mode, but @@ -2071,12 +2131,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; // inb %dx -> inb %al, %dx -def : InstAlias<"inb %dx", (IN8rr)>; -def : InstAlias<"inw %dx", (IN16rr)>; -def : InstAlias<"inl %dx", (IN32rr)>; -def : InstAlias<"inb $port", (IN8ri i8imm:$port)>; -def : InstAlias<"inw $port", (IN16ri i8imm:$port)>; -def : InstAlias<"inl $port", (IN32ri i8imm:$port)>; +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp @@ -2104,7 +2164,7 @@ def : InstAlias<"movq $src, $dst", // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. (as in rep; movsd) -def : InstAlias<"movsd", (MOVSD)>; +def : InstAlias<"movsd", (MOVSD), 0>; // movsx aliases def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; @@ -2125,12 +2185,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
// outb %dx -> outb %al, %dx -def : InstAlias<"outb %dx", (OUT8rr)>; -def : InstAlias<"outw %dx", (OUT16rr)>; -def : InstAlias<"outl %dx", (OUT32rr)>; -def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>; -def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>; -def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity @@ -2138,19 +2198,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; // shld/shrd op,op -> shld op, op, CL -def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; -def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; - -def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; -def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : 
InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. @@ -2181,19 +2241,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. -def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. 
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. -def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; -def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>; +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 07314a0..cb12956 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -189,13 +189,14 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { - def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2), - asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], - NoItinerary, d>; - def irm : PI<opc, 
MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], - NoItinerary, d>; + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + NoItinerary, d>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + NoItinerary, d>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 79b1ca3..a86006a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2843,8 +2843,8 @@ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, SSE_BIT_ITINS_P>; -let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, +let isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, SSE_BIT_ITINS_P>; /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops @@ -5477,12 +5477,12 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", TB, Requires<[HasSSE3]>; } // SchedRW -def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; -def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; -def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, Requires<[In32BitMode]>; -def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, Requires<[In64BitMode]>; 
//===----------------------------------------------------------------------===// @@ -6139,11 +6139,11 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { } let ExeDomain = SSEPackedSingle in { - let Predicates = [HasAVX] in { + let Predicates = [UseAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), - "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX; } defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; @@ -7016,17 +7016,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, int_x86_sse41_pblendvb>; // Aliases with the implicit xmm0 argument -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; let Predicates = [UseSSE41] in { @@ -7266,62 +7266,62 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { 
let Constraints = "$src1 = $dst" in { def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_8 GR32:$src1, (load addr:$src2)))]>; def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i16mem:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", + "crc32{w}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_16 GR32:$src1, (load addr:$src2)))]>, OpSize; def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR16:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", + "crc32{w}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, OpSize; def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", + "crc32{l}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_32 GR32:$src1, (load addr:$src2)))]>; def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", + "crc32{l}\t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_8 GR64:$src1, (load addr:$src2)))]>, REX_W; def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", + "crc32{b}\t{$src2, $src1|$src1, 
$src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, REX_W; def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", + "crc32{q}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_64 GR64:$src1, (load addr:$src2)))]>, REX_W; def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", + "crc32{q}\t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, REX_W; @@ -7586,62 +7586,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX] in { -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } let Predicates = [HasAVX1Only] in { -def : 
Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (bc_v4i32 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (bc_v16i8 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - 
(INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (bc_v8i16 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -7661,59 +7661,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), // AVX1 patterns let Predicates = [HasAVX] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4f32 (VEXTRACTF128rr (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2f64 (VEXTRACTF128rr (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), +def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTF128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm 
VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTF128rr (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTF128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTF128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), (iPTR imm))), 
addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -8191,42 +8191,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX2] in { -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (bc_v4i32 (memopv2i64 addr:$src2)), 
(iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (bc_v16i8 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (bc_v8i16 (memopv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -8245,39 +8245,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), VEX, VEX_L; let Predicates = [HasAVX2] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTI128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTI128rr (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTI128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTI128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; 
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td index 757dcd0..0191c01 100644 --- a/lib/Target/X86/X86InstrSVM.td +++ b/lib/Target/X86/X86InstrSVM.td @@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; // 0F 01 DE let Uses = [EAX] in -def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB; +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; // 0F 01 D8 let Uses = [EAX] in def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = 
[RAX] in def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DA let Uses = [EAX] in def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DB let Uses = [EAX] in def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DF let Uses = [EAX, ECX] in def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>; + "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX, ECX] in def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>; + "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index 89c1a689..1937770 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -18,16 +18,16 @@ let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, 
$dst|$dst, cl}", [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize; def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>; def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] @@ -70,17 +70,17 @@ let SchedRW = [WriteShiftLd, WriteRMW] in { // using CL? let Uses = [CL] in { def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -124,16 +124,16 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize; def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), - "shr{l}\t{%cl, 
$dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>; def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } @@ -171,17 +171,17 @@ def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), - "shr{l}\t{%cl, $dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -224,19 +224,19 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (sra GR8:$src1, CL))], IIC_SR>; def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (sra GR16:$src1, CL))], IIC_SR>, OpSize; def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (sra GR32:$src1, CL))], IIC_SR>; def SAR64rCL : 
RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (sra GR64:$src1, CL))], IIC_SR>; } @@ -283,19 +283,19 @@ def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -349,7 +349,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize; @@ -357,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>; @@ -365,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, 
(outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -374,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -383,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize; @@ -391,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>; @@ -399,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), 
"rcr{q}\t$dst", [], IIC_SR>; @@ -407,7 +407,7 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } // Constraints = "$src = $dst" @@ -448,22 +448,22 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), let Uses = [CL] in { def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } } // SchedRW } // hasSideEffects = 0 @@ -472,16 +472,16 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // FIXME: provide shorter instructions when imm8 == 1 let Uses = [CL] in { def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 
:$src1), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize; def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>; def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), - "rol{q}\t{%cl, $dst|$dst, CL}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } @@ -525,19 +525,19 @@ def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), - "rol{q}\t{%cl, $dst|$dst, %cl}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -582,16 +582,16 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins 
GR16:$src1), - "ror{w}\t{%cl, $dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize; def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>; def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } @@ -635,19 +635,19 @@ def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), - "ror{w}\t{%cl, $dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -699,35 +699,35 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, 
(X86shrd GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; @@ -782,29 +782,29 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi32 
addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index bab3cdd..2196dc3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -77,43 +77,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, let SchedRW = [WriteSystem] in { let Defs = [AL], Uses = [DX] in def IN8rr : I<0xEC, RawFrm, (outs), (ins), - "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; + "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>; let Defs = [AX], Uses = [DX] in def IN16rr : I<0xED, RawFrm, (outs), (ins), - "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; + "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize; let Defs = [EAX], Uses = [DX] in def IN32rr : I<0xED, RawFrm, (outs), (ins), - "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; + "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>; let Defs = [AL] in def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), - "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; + "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{w}\t{$port, %ax|AX, $port}", [], 
IIC_IN_RI>, OpSize; + "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize; let Defs = [EAX] in def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; + "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>; let Uses = [DX, AL] in def OUT8rr : I<0xEE, RawFrm, (outs), (ins), - "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; + "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>; let Uses = [DX, AX] in def OUT16rr : I<0xEF, RawFrm, (outs), (ins), - "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; + "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize; let Uses = [DX, EAX] in def OUT32rr : I<0xEF, RawFrm, (outs), (ins), - "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; + "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>; let Uses = [AL] in def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), - "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; + "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; + "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize; let Uses = [EAX] in def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; + "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>; def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; @@ -248,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), - "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), - "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; + "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; def PUSHSS16 : I<0x16, RawFrm, (outs), 
(ins), - "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), - "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), - "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), - "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHES16 : I<0x06, RawFrm, (outs), (ins), - "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHES32 : I<0x06, RawFrm, (outs), (ins), - "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), - "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; + "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), - "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), - "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; + "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), - "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), - "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB; def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), - "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, 
TB; // No "pop cs" instruction. def POPSS16 : I<0x17, RawFrm, (outs), (ins), - "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>, OpSize, Requires<[In32BitMode]>; def POPSS32 : I<0x17, RawFrm, (outs), (ins), - "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>, Requires<[In32BitMode]>; def POPDS16 : I<0x1F, RawFrm, (outs), (ins), - "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{w}\t{%ds|ds}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPDS32 : I<0x1F, RawFrm, (outs), (ins), - "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{l}\t{%ds|ds}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPES16 : I<0x07, RawFrm, (outs), (ins), - "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + "pop{w}\t{%es|es}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPES32 : I<0x07, RawFrm, (outs), (ins), - "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + "pop{l}\t{%es|es}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPFS16 : I<0xa1, RawFrm, (outs), (ins), - "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB; def POPFS32 : I<0xa1, RawFrm, (outs), (ins), - "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPFS64 : I<0xa1, RawFrm, (outs), (ins), - "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB; def POPGS16 : I<0xa9, RawFrm, (outs), (ins), - "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB; def POPGS32 : I<0xa9, RawFrm, (outs), (ins), - "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPGS64 : I<0xa9, RawFrm, (outs), (ins), - "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB; def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), diff --git a/lib/Target/X86/X86InstrTSX.td 
b/lib/Target/X86/X86InstrTSX.td index 363a190..59a6f1e 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -37,3 +37,10 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + +// HLE prefixes + +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; + +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; + diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index a453245..c7c00b5 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -254,6 +254,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { Inst.addOperand(Saved); } +/// \brief If a movsx instruction has a shorter encoding for the used register +/// simplify the instruction to use it instead. +static void SimplifyMOVSX(MCInst &Inst) { + unsigned NewOpcode = 0; + unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg(); + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw + if (Op0 == X86::AX && Op1 == X86::AL) + NewOpcode = X86::CBW; + break; + case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl + if (Op0 == X86::EAX && Op1 == X86::AX) + NewOpcode = X86::CWDE; + break; + case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq + if (Op0 == X86::RAX && Op1 == X86::EAX) + NewOpcode = X86::CDQE; + break; + } + + if (NewOpcode != 0) { + Inst = MCInst(); + Inst.setOpcode(NewOpcode); + } +} + /// \brief Simplify things like MOV32rm to MOV32o32a. 
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, unsigned Opcode) { @@ -557,6 +585,13 @@ ReSimplify: case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + // Try to shrink some forms of movsx. + case X86::MOVSX16rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr32: + SimplifyMOVSX(OutMI); + break; + case X86::MORESTACK_RET: OutMI.setOpcode(X86::RET); break; @@ -654,13 +689,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(Mang, *MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - std::string TmpStr; - raw_string_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; + llvm_unreachable("Should be handled target independently"); // Emit nothing here but a comment if we can. case X86::Int_MemBarrier: diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index eacae2c..0923310 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -241,6 +241,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { case CallingConv::Intel_OCL_BI: { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) @@ -275,8 +280,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); if (CC == 
CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX512) + return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; + if (Is64Bit && HasAVX512) + return CSR_64_Intel_OCL_BI_AVX512_RegMask; if (IsWin64 && HasAVX) return CSR_Win64_Intel_OCL_BI_AVX_RegMask; if (Is64Bit && HasAVX) @@ -344,14 +354,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::GS); // Mark the floating point stack registers as reserved. - Reserved.set(X86::ST0); - Reserved.set(X86::ST1); - Reserved.set(X86::ST2); - Reserved.set(X86::ST3); - Reserved.set(X86::ST4); - Reserved.set(X86::ST5); - Reserved.set(X86::ST6); - Reserved.set(X86::ST7); + for (unsigned n = 0; n != 8; ++n) + Reserved.set(X86::ST0 + n); // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { @@ -364,19 +368,17 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... - static const uint16_t GPR64[] = { - X86::R8, X86::R9, X86::R10, X86::R11, - X86::R12, X86::R13, X86::R14, X86::R15 - }; - for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); // XMM8, XMM9, ... 
- static const uint16_t XMMReg[] = { - X86::XMM8, X86::XMM9, X86::XMM10, X86::XMM11, - X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15 - }; - for (MCRegAliasIterator AI(XMMReg[n], this, true); AI.isValid(); ++AI) + for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) { + for (unsigned n = 16; n != 32; ++n) { + for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } @@ -409,10 +411,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + return false; + const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - if (!MF.getTarget().Options.RealignStack) - return false; // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. @@ -690,4 +693,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, } } } + +unsigned get512BitSuperRegister(unsigned Reg) { + if (Reg >= X86::XMM0 && Reg <= X86::XMM31) + return X86::ZMM0 + (Reg - X86::XMM0); + if (Reg >= X86::YMM0 && Reg <= X86::YMM31) + return X86::ZMM0 + (Reg - X86::YMM0); + if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) + return Reg; + llvm_unreachable("Unexpected SIMD register"); +} + } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 6a1b328..fb17682 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -137,6 +137,9 @@ public: // e.g. 
getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); +//get512BitRegister - X86 utility - returns 512-bit super register +unsigned get512BitSuperRegister(unsigned Reg); + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index fbbb257..b802728 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -26,6 +26,7 @@ let Namespace = "X86" in { def sub_16bit : SubRegIndex<16>; def sub_32bit : SubRegIndex<32>; def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; } //===----------------------------------------------------------------------===// @@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>; + } // CostPerUse -// YMM Registers, 
used by AVX instructions +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. let SubRegIndices = [sub_xmm] in { -def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>; -def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>; -def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>; -def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>; -def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>; -def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>; -def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>; -def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>; -def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>; -def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>; -def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>; -def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>; -def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>; -def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>; -def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>; -def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>; + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } } + // Mask Registers, used by AVX-512 instructions. 
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; + def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; + def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; + def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; + def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; + def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; + def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; + def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; + class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { let Aliases = A; } @@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512, + (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. +def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + 256, (sequence "YMM%u", 0, 31)>; + +def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>; +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>; + +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>; +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; + diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 84c9203..62ba2bc 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -16,9 +16,8 @@ def HaswellModel : SchedMachineModel { // All x86 instructions are modeled as a single micro-op, and HW can decode 4 // instructions per cycle. let IssueWidth = 4; - let MinLatency = 0; // 0 = Out-of-order execution. 
+ let MicroOpBufferSize = 192; // Based on the reorder buffer. let LoadLatency = 4; - let ILPWindow = 30; let MispredictPenalty = 16; } @@ -50,6 +49,12 @@ def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + // Integer division issued on port 0. def HWDivider : ProcResource<1>; @@ -86,6 +91,7 @@ def : WriteRes<WriteZero, []>; defm : HWWriteResPair<WriteALU, HWPort0156, 1>; defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } defm : HWWriteResPair<WriteShift, HWPort056, 1>; defm : HWWriteResPair<WriteJump, HWPort5, 1>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index b36b3ad..52ead94 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -17,9 +17,8 @@ def SandyBridgeModel : SchedMachineModel { // instructions per cycle. // FIXME: Identify instructions that aren't a single fused micro-op. let IssueWidth = 4; - let MinLatency = 0; // 0 = Out-of-order execution. + let MicroOpBufferSize = 168; // Based on the reorder buffer. let LoadLatency = 4; - let ILPWindow = 20; let MispredictPenalty = 16; } @@ -46,6 +45,11 @@ def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + // Integer division issued on port 0. 
def SBDivider : ProcResource<1>; @@ -82,6 +86,7 @@ def : WriteRes<WriteZero, []>; defm : SBWriteResPair<WriteALU, SBPort015, 1>; defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } defm : SBWriteResPair<WriteShift, SBPort05, 1>; defm : SBWriteResPair<WriteJump, SBPort5, 1>; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 9f2c781..ceb2e05 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -42,6 +42,7 @@ multiclass X86SchedWritePair { // Arithmetic. defm WriteALU : X86SchedWritePair; // Simple integer ALU op. defm WriteIMul : X86SchedWritePair; // Integer multiplication. +def WriteIMulH : SchedWrite; // Integer multiplication, high part. defm WriteIDiv : X86SchedWritePair; // Integer division. def WriteLEA : SchedWrite; // LEA instructions can't fold loads. @@ -550,8 +551,9 @@ def IIC_NOP : InstrItinClass; // Resources beyond the decoder operate on micro-ops and are bufferred // so adjacent micro-ops don't directly compete. // -// MinLatency=0 indicates that RAW dependencies can be decoded in the -// same cycle. +// MicroOpBufferSize > 1 indicates that RAW dependencies can be +// decoded in the same cycle. The value 32 is a reasonably arbitrary +// number of in-flight instructions. // // HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef // indicates high latency opcodes. Alternatively, InstrItinData @@ -559,17 +561,12 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // -// ILPWindow=10 is an arbitrary threshold that approximates cycles of -// latency hidden by instruction buffers. The actual value is not very -// important but should be zero for inorder and nonzero for OOO processors. -// // The GenericModel contains no instruciton itineraries. 
def GenericModel : SchedMachineModel { let IssueWidth = 4; - let MinLatency = 0; + let MicroOpBufferSize = 32; let LoadLatency = 4; let HighLatency = 10; - let ILPWindow = 10; } include "X86ScheduleAtom.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index cb0960a..14a1471 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -525,11 +525,9 @@ def AtomItineraries : ProcessorItineraries< // Atom machine model. def AtomModel : SchedMachineModel { let IssueWidth = 2; // Allows 2 instructions per scheduling group. - let MinLatency = 1; // InstrStage cycles overrides MinLatency. - // OperandCycles may be used for expected latency. + let MicroOpBufferSize = 0; // In-order execution, always hide latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let ILPWindow = 0; // Always try to hide expected latency. let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 74da2a9..fae90f2 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -477,6 +477,9 @@ void X86Subtarget::initializeEnvironment() { HasBMI2 = false; HasRTM = false; HasHLE = false; + HasERI = false; + HasCDI = false; + HasPFI=false; HasADX = false; HasPRFCHW = false; HasRDSEED = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 66832b9..8793238 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -42,7 +42,7 @@ enum Style { class X86Subtarget : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512 }; enum X863DNowEnum { @@ -169,6 +169,15 @@ protected: /// address generation (AG) time. 
bool LEAUsesAG; + /// Processor has AVX-512 PreFetch Instructions + bool HasPFI; + + /// Processor has AVX-512 Exponential and Reciprocal Instructions + bool HasERI; + + /// Processor has AVX-512 Conflict Detection Instructions + bool HasCDI; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -249,6 +258,7 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasAVX512() const { return X86SSELevel >= AVX512; } bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } @@ -282,6 +292,9 @@ public: bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } + bool hasCDI() const { return HasCDI; } + bool hasPFI() const { return HasPFI; } + bool hasERI() const { return HasERI; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -338,7 +351,13 @@ public: } bool isPICStyleStubAny() const { return PICStyle == PICStyles::StubDynamicNoPIC || - PICStyle == PICStyles::StubPIC; } + PICStyle == PICStyles::StubPIC; + } + + bool isCallingConvWin64(CallingConv::ID CC) const { + return (isTargetWin64() && CC != CallingConv::X86_64_SysV) || + CC == CallingConv::X86_64_Win64; + } /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 0422a61..49ebd1a 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -132,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { // Add first the target-independent 
BasicTTI pass, then our X86 pass. This // allows the X86 pass to delegate to the target independent layer when // appropriate. - PM.add(createBasicTargetTransformInfoPass(getTargetLowering())); + PM.add(createBasicTargetTransformInfoPass(this)); PM.add(createX86TargetTransformInfoPass(this)); } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 871dacd..a19c5a6 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +const MCExpr * +X86LinuxTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 9d26d38..79c861d 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -36,6 +36,9 @@ namespace llvm { /// and x86-64. class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + /// \brief Describe a TLS variable address within debug info. 
+ virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const; }; } // end namespace llvm diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index eba9d78..3bbddad 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &); namespace { class X86TTI : public ImmutablePass, public TargetTransformInfo { - const X86TargetMachine *TM; const X86Subtarget *ST; const X86TargetLowering *TLI; @@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + X86TTI() : ImmutablePass(ID), ST(0), TLI(0) { llvm_unreachable("This pass cannot be directly constructed"); } X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializeX86TTIPass(*PassRegistry::getPassRegistry()); } @@ -101,6 +100,8 @@ public: unsigned Alignment, unsigned AddressSpace) const; + virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const; + /// @} }; @@ -196,6 +197,16 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized. { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, }; // Look for AVX2 lowering tricks. 
@@ -258,6 +269,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spill regular + // registers. The overhead of division is going to dominate most kernels + // anyway so try hard to prevent vectorization of division - it is + // generally a bad idea. Assume somewhat arbitrarily that we have to be able + // to hide "20 cycles" for each lane. + { ISD::SDIV, MVT::v16i8, 16*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v16i8, 16*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, }; if (ST->hasSSE2()) { @@ -467,19 +493,22 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, }; if (ST->hasAVX2()) { - int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), + ISD, MTy); if (Idx != -1) return LT.first * AVX2CostTbl[Idx].Cost; } if (ST->hasAVX()) { - int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), + ISD, MTy); if (Idx != -1) return LT.first * AVX1CostTbl[Idx].Cost; } if (ST->hasSSE42()) { - int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), + ISD, MTy); if (Idx != -1) return LT.first * SSE42CostTbl[Idx].Cost; } @@ -511,8 +540,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); } +unsigned 
X86TTI::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) const { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { + // Handle non power of two vectors such as <3 x float> + if (VectorType *VTy = dyn_cast<VectorType>(Src)) { + unsigned NumElem = VTy->getVectorNumElements(); + + // Handle a few common cases: + // <3 x float> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) + // Cost = 64 bit store + extract + 32 bit store. + return 3; + + // <3 x double> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) + // Cost = 128 bit store + unpack + 64 bit store. + return 3; + + // Assume that all other non power-of-two numbers are scalarized. + if (!isPowerOf2_32(NumElem)) { + unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, + VTy->getScalarType(), + Alignment, + AddressSpace); + unsigned SplitCost = getScalarizationOverhead(Src, + Opcode == Instruction::Load, + Opcode==Instruction::Store); + return NumElem * Cost + SplitCost; + } + } + // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -528,3 +600,16 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, return Cost; } + +unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. 
The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex); +} diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 0f77948..477f75a 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { } static bool isYmmReg(unsigned Reg) { - if (Reg >= X86::YMM0 && Reg <= X86::YMM15) - return true; + return (Reg >= X86::YMM0 && Reg <= X86::YMM31); +} - return false; +static bool isZmmReg(unsigned Reg) { + return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); } static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first)) + if (isYmmReg(I->first) || isZmmReg(I->first)) return true; return false; } static bool clobbersAllYmmRegs(const MachineOperand &MO) { - for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { + for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } |