Diffstat (limited to 'lib/Target/X86')
38 files changed, 2095 insertions, 1718 deletions
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 08646d0..7194dd3 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -1,8 +1,10 @@
 LOCAL_PATH := $(call my-dir)
 
 x86_codegen_TBLGEN_TABLES := \
+  X86GenAsmMatcher.inc \
   X86GenAsmWriter.inc \
   X86GenAsmWriter1.inc \
+  X86GenDisassemblerTables.inc \
   X86GenRegisterInfo.inc \
   X86GenInstrInfo.inc \
   X86GenDAGISel.inc \
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 0b6fb52..c24805a 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -11,6 +11,7 @@
 #include "X86AsmInstrumentation.h"
 #include "X86AsmParserCommon.h"
 #include "X86Operand.h"
+#include "X86ISelLowering.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
@@ -664,6 +665,7 @@ private:
   ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
   std::unique_ptr<X86Operand> ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
+  std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
   bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
   std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
@@ -1407,6 +1409,35 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
                                /*Scale=*/1, Start, End, Size, Identifier, Info);
 }
 
+//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  consumeToken(); // Eat "{"
+  if (Tok.getIdentifier().startswith("r")){
+    int rndMode = StringSwitch<int>(Tok.getIdentifier())
+      .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
+      .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
+      .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
+      .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
+      .Default(-1);
+    if (-1 == rndMode)
+      return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
+    Parser.Lex();  // Eat "r*" of r*-sae
+    if (!getLexer().is(AsmToken::Minus))
+      return ErrorOperand(Tok.getLoc(), "Expected - at this point");
+    Parser.Lex();  // Eat "-"
+    Parser.Lex();  // Eat the sae
+    if (!getLexer().is(AsmToken::RCurly))
+      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+    Parser.Lex();  // Eat "}"
+    const MCExpr *RndModeOp =
+      MCConstantExpr::Create(rndMode, Parser.getContext());
+    return X86Operand::CreateImm(RndModeOp, Start, End);
+  }
+  return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+}
 /// ParseIntelMemOperand - Parse intel style memory operand.
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
                                                                SMLoc Start,
@@ -1656,6 +1687,11 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
     return ParseIntelMemOperand(Imm, Start, Size);
   }
 
+  // rounding mode token
+  if (STI.getFeatureBits() & X86::FeatureAVX512 &&
+      getLexer().is(AsmToken::LCurly))
+    return ParseRoundingModeOp(Start, End);
+
   // Register.
unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { @@ -1708,6 +1744,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { return nullptr; return X86Operand::CreateImm(Val, Start, End); } + case AsmToken::LCurly:{ + SMLoc Start = Parser.getTok().getLoc(), End; + if (STI.getFeatureBits() & X86::FeatureAVX512) + return ParseRoundingModeOp(Start, End); + return ErrorOperand(Start, "unknown token in expression"); + } } } diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index d67e119..94dbedb 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -260,6 +260,9 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; } + bool isAVX512RC() const{ + return isImm(); + } bool isAbsMem16() const { return isAbsMem() && Mem.ModeSize == 16; @@ -394,7 +397,10 @@ struct X86Operand : public MCParsedAsmOperand { RegNo = getGR32FromGR64(RegNo); Inst.addOperand(MCOperand::CreateReg(RegNo)); } - + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 99fb1ab..e8c5475 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -378,26 +378,28 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, unsigned NewOpc; switch (mcInst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); - case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; - case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; - case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; - case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; - case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; - case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; - case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; - case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; - case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; - case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; - case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; - case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; - case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; - case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; - case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; - case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; - case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; - case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = 
X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; } // Switch opcode to the one that doesn't get special printing. mcInst.setOpcode(NewOpc); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 619a0d4..7c9e012 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -310,11 +310,8 @@ static bool isPrefixAtLocation(struct InternalInstruction* insn, uint8_t prefix, uint64_t location) { - if (insn->prefixPresent[prefix] == 1 && - insn->prefixLocations[prefix] == location) - return true; - else - return false; + return insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location; } /* @@ -1458,6 +1455,8 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_VK1: \ case TYPE_VK8: \ case TYPE_VK16: \ + if (index > 7) \ + *valid = 0; \ return prefix##_K0 + index; \ case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 70c6042..9e65050 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -485,18 +485,6 @@ struct OperandSpecifier { uint8_t type; }; -// Indicates where the opcode modifier (if any) is to be found. Extended -// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. -#define MODIFIER_TYPES \ - ENUM_ENTRY(MODIFIER_NONE) - -#define ENUM_ENTRY(n) n, -enum ModifierType { - MODIFIER_TYPES - MODIFIER_max -}; -#undef ENUM_ENTRY - static const unsigned X86_MAX_OPERANDS = 6; /// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 719b761..a400d46 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -76,8 +76,8 @@ class X86AsmBackend : public MCAsmBackend { bool HasNopl; const uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) + : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -351,8 +351,8 @@ namespace { class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; - ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) - : X86AsmBackend(T, CPU), OSABI(_OSABI) {} + ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(OSABI) {} }; class ELFX86_32AsmBackend : public ELFX86AsmBackend { diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index e8b0b4c..76a9d2b 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -38,231 +38,214 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, X86ELFObjectWriter::~X86ELFObjectWriter() {} -unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - // determine the type of the relocation +enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Type; - if (getEMachine() == ELF::EM_X86_64) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - - case FK_Data_8: Type = ELF::R_X86_64_PC64; break; - case FK_Data_4: Type = ELF::R_X86_64_PC32; break; - case FK_Data_2: Type = ELF::R_X86_64_PC16; break; - case FK_Data_1: Type = ELF::R_X86_64_PC8; break; +static X86_64RelType getType64(unsigned Kind, + MCSymbolRefExpr::VariantKind &Modifier, + bool &IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("Unimplemented"); + case X86::reloc_global_offset_table8: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_64; + case FK_Data_8: + return RT64_64; + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel) + return RT64_32S; + return RT64_32; + case X86::reloc_global_offset_table: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_32; + case FK_Data_4: + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return RT64_32; + case FK_Data_2: + return RT64_16; + case FK_PCRel_1: + case FK_Data_1: + return RT64_8; + } +} - case FK_PCRel_8: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC64; - break; - case X86::reloc_signed_4byte: - case X86::reloc_riprel_4byte_movq_load: - case X86::reloc_riprel_4byte: - case FK_PCRel_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_X86_64_PLT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_X86_64_GOTTPOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_X86_64_TLSGD; - break; - case MCSymbolRefExpr::VK_TLSLD: - Type = ELF::R_X86_64_TLSLD; - break; - } - break; - case FK_PCRel_2: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC16; - break; - case FK_PCRel_1: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC8; - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - case X86::reloc_global_offset_table8: - Type 
= ELF::R_X86_64_GOTPC64; - break; - case X86::reloc_global_offset_table: - Type = ELF::R_X86_64_GOTPC32; - break; - case FK_Data_8: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_64; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT64; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_X86_64_GOTOFF64; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF64; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF64; - break; - } - break; - case X86::reloc_signed_4byte: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_32S; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF32; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF32; - break; - } - break; - case FK_Data_4: - Type = ELF::R_X86_64_32; - break; - case FK_Data_2: Type = ELF::R_X86_64_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_X86_64_8; break; - } +static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier, + X86_64RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32; + case RT64_32S: + return ELF::R_X86_64_32S; + case RT64_16: + return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16; + case RT64_8: + return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8; } - } else if (getEMachine() == ELF::EM_386) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; - - case FK_PCRel_1: - case FK_Data_1: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC8; - break; - } - break; - - case FK_PCRel_2: - case FK_Data_2: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC16; - break; - } - break; + case MCSymbolRefExpr::VK_GOT: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64; + case RT64_32: + return IsPCRel ? 
ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT64_64); + assert(!IsPCRel); + return ELF::R_X86_64_GOTOFF64; + case MCSymbolRefExpr::VK_TPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_TPOFF64; + case RT64_32: + return ELF::R_X86_64_TPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_DTPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_DTPOFF64; + case RT64_32: + return ELF::R_X86_64_DTPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_SIZE: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_SIZE64; + case RT64_32: + return ELF::R_X86_64_SIZE32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSGD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTTPOFF; + case MCSymbolRefExpr::VK_TLSLD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSLD; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT64_32); + return ELF::R_X86_64_PLT32; + case MCSymbolRefExpr::VK_GOTPCREL: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTPCREL; + } +} - case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - } - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); +enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; +static X86_32RelType getType32(X86_64RelType T) { + switch (T) { + case RT64_64: + llvm_unreachable("Unimplemented"); + case RT64_32: + case RT64_32S: + return RT32_32; + case RT64_16: + return RT32_16; + case RT64_8: + return RT32_8; + } + llvm_unreachable("unexpected relocation type!"); +} - // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode - // instead? 
- case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_32; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_386_GOT32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_386_GOTOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_386_TLS_GD; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_386_TLS_LE_32; - break; - case MCSymbolRefExpr::VK_INDNTPOFF: - Type = ELF::R_386_TLS_IE; - break; - case MCSymbolRefExpr::VK_NTPOFF: - Type = ELF::R_386_TLS_LE; - break; - case MCSymbolRefExpr::VK_GOTNTPOFF: - Type = ELF::R_386_TLS_GOTIE; - break; - case MCSymbolRefExpr::VK_TLSLDM: - Type = ELF::R_386_TLS_LDM; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_386_TLS_LDO_32; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_386_TLS_IE_32; - break; - } - break; - case FK_Data_2: Type = ELF::R_386_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_386_8; break; - } +static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier, + X86_32RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT32_32: + return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; + case RT32_16: + return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16; + case RT32_8: + return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; } - } else - llvm_unreachable("Unsupported ELF machine type."); + case MCSymbolRefExpr::VK_GOT: + assert(Type == RT32_32); + return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32; + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE_32; + case MCSymbolRefExpr::VK_DTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDO_32; + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE_32; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT32_32); + return ELF::R_386_PLT32; + case MCSymbolRefExpr::VK_INDNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE; + case MCSymbolRefExpr::VK_NTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE; + case MCSymbolRefExpr::VK_GOTNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GOTIE; + case MCSymbolRefExpr::VK_TLSLDM: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDM; + } +} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel); + if (getEMachine() == ELF::EM_X86_64) + return getRelocType64(Modifier, Type, IsPCRel); - return Type; + assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + return getRelocType32(Modifier, getType32(Type), IsPCRel); } MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 
b679316..10b83f4 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -36,7 +36,7 @@ public: MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. - if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 3ad8ab1..9b98a3e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -168,10 +168,8 @@ public: } // end anonymous namespace - MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, Ctx); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 0e7b4e5..0946326 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -55,143 +55,6 @@ std::string X86_MC::ParseX86Triple(StringRef TT) { return FS; } -/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the -/// specified arguments. If we can't run cpuid on the host, return true. -bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually. - asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - int registers[4]; - __cpuid(registers, value); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the -/// 4 values in the specified arguments. If we can't run cpuid on the host, -/// return true. -bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually. 
- asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, - unsigned &Model) { - Family = (EAX >> 8) & 0xf; // Bits 8 - 11 - Model = (EAX >> 4) & 0xf; // Bits 4 - 7 - if (Family == 6 || Family == 0xf) { - if (Family == 0xf) - // Examine extended family ID if family ID is F. - Family += (EAX >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. - Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 - } -} - unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; @@ -344,24 +207,6 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - Triple TheTriple(TT); - - switch (TheTriple.getObjectFormat()) { - default: llvm_unreachable("unsupported object format"); - case Triple::MachO: - return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - case Triple::COFF: - assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); - return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); - case Triple::ELF: - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - } -} - static MCInstPrinter *createX86MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -392,61 +237,42 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { // Force static initialization. extern "C" void LLVMInitializeX86TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo); - RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo); - - // Register the MC codegen info. - RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo); - RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo); - - // Register the MC subtarget info. 
- TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target, - X86_MC::createX86MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target, - X86_MC::createX86MCSubtargetInfo); - - // Register the MC instruction analyzer. - TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target, - createX86MCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target, - createX86MCInstrAnalysis); - - // Register the code emitter. - TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target, - createX86MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target, - createX86MCCodeEmitter); + for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo); + } // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, createX86_32AsmBackend); TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target, - createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheX86_32Target, - createX86MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, - createX86MCInstPrinter); - - // Register the MC relocation info. - TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target, - createX86MCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target, - createX86MCRelocationInfo); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index d8320b9..6f50f11 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -53,18 +53,6 @@ namespace N86 { namespace X86_MC { std::string ParseX86Triple(StringRef TT); - /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in - /// the specified arguments. If we can't run cpuid on the host, return true. - bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return - /// the 4 values in the specified arguments. If we can't run cpuid on the - /// host, return true. 
- bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - - void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); @@ -78,7 +66,6 @@ namespace X86_MC { MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, @@ -86,12 +73,12 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); -/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code -/// streamer which will generate PE/COFF format object files. +/// Construct an X86 Windows COFF machine code streamer which will generate +/// PE/COFF format object files. /// /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, + raw_ostream &OS, MCCodeEmitter *CE, bool RelaxAll); /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index 3b81d53..81749fc 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -38,7 +38,7 @@ public: MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. - if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; @@ -93,7 +93,7 @@ public: RSymI->getName(RSymName); MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName); - if (RSym->isVariable() == false) + if (!RSym->isVariable()) RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx)); const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 5f1596c..5690efe 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -48,13 +48,11 @@ void X86WinCOFFStreamer::FinishImpl() { } } -namespace llvm { -MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, - bool RelaxAll) { +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_ostream &OS, MCCodeEmitter *CE, + bool RelaxAll) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); return S; } -} diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 71329b0..e6896e8 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -93,36 +93,6 @@ The pattern isel got this one right. 
//===---------------------------------------------------------------------===// -SSE should implement 'select_cc' using 'emulated conditional moves' that use -pcmp/pand/pandn/por to do a selection instead of a conditional branch: - -double %X(double %Y, double %Z, double %A, double %B) { - %C = setlt double %A, %B - %z = fadd double %Z, 0.0 ;; select operand is not a load - %D = select bool %C, double %Y, double %z - ret double %D -} - -We currently emit: - -_X: - subl $12, %esp - xorpd %xmm0, %xmm0 - addsd 24(%esp), %xmm0 - movsd 32(%esp), %xmm1 - movsd 16(%esp), %xmm2 - ucomisd 40(%esp), %xmm1 - jb LBB_X_2 -LBB_X_1: - movsd %xmm0, %xmm2 -LBB_X_2: - movsd %xmm2, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - Lower memcpy / memset to a series of SSE 128 bit move instructions when it's feasible. @@ -787,25 +757,6 @@ cheaper to do fld1 than load from a constant pool for example, so //===---------------------------------------------------------------------===// -The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to -"cmpsd". For example, this code: - -double d1(double x) { return x == x ? x : x + x; } - -Compiles into: - -_d1: - ucomisd %xmm0, %xmm0 - jnp LBB1_2 - addsd %xmm0, %xmm0 - ret -LBB1_2: - ret - -Also, the 'ret's should be shared. This is PR6032. - -//===---------------------------------------------------------------------===// - These should compile into the same code (PR6214): Perhaps instcombine should canonicalize the former into the later? @@ -858,35 +809,6 @@ doing a shuffle from v[1] to v[0] then a float store. //===---------------------------------------------------------------------===// -On SSE4 machines, we compile this code: - -define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, - <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - - store <2 x float> %Z, <2 x float> *%P - ret <2 x float> %Z -} - -into: - -_test2: ## @test2 -## BB#0: - insertps $0, %xmm2, %xmm2 - insertps $16, %xmm3, %xmm2 - insertps $0, %xmm0, %xmm3 - insertps $16, %xmm1, %xmm3 - addps %xmm2, %xmm3 - movq %xmm3, (%rdi) - movaps %xmm3, %xmm0 - pshufd $1, %xmm3, %xmm1 - ## kill: XMM1<def> XMM1<kill> - ret - -The insertps's of $0 are pointless complex copies. - -//===---------------------------------------------------------------------===// - [UNSAFE FP] void foo(double, double, double); diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index bb0b9ce..f6033a7 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -63,9 +63,6 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer.EndCOFFSymbolDef(); } - // Have common code print out the function header with linkage info etc. - EmitFunctionHeader(); - // Emit the rest of the function body. 
EmitFunctionBody(); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index a17f052..cba140f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -84,7 +84,7 @@ private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, - unsigned &ResultReg); + unsigned &ResultReg, unsigned Alignment = 1); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); @@ -327,7 +327,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - MachineMemOperand *MMO, unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg, + unsigned Alignment) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -372,6 +373,30 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, case MVT::f80: // No f80 support yet. return false; + case MVT::v4f32: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm; + RC = &X86::VR128RegClass; + break; + case MVT::v2f64: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm; + RC = &X86::VR128RegClass; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm; + RC = &X86::VR128RegClass; + break; } ResultReg = createResultReg(RC); @@ -1068,8 +1093,14 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86SelectAddress(Ptr, AM)) return false; + unsigned Alignment = LI->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + unsigned ResultReg = 0; - if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, + Alignment)) return false; updateValueMap(I, ResultReg); @@ -1094,20 +1125,30 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { } } -/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS -/// of the comparison, return an opcode that works for the compare (e.g. -/// CMP32ri) otherwise return 0. +/// If we have a comparison with RHS as the RHS of the comparison, return an +/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + int64_t Val = RHSC->getSExtValue(); switch (VT.getSimpleVT().SimpleTy) { // Otherwise, we can't fold the immediate into this comparison. 
- default: return 0; - case MVT::i8: return X86::CMP8ri; - case MVT::i16: return X86::CMP16ri; - case MVT::i32: return X86::CMP32ri; + default: + return 0; + case MVT::i8: + return X86::CMP8ri; + case MVT::i16: + if (isInt<8>(Val)) + return X86::CMP16ri8; + return X86::CMP16ri; + case MVT::i32: + if (isInt<8>(Val)) + return X86::CMP32ri8; + return X86::CMP32ri; case MVT::i64: + if (isInt<8>(Val)) + return X86::CMP64ri8; // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext // field. - if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) + if (isInt<32>(Val)) return X86::CMP64ri32; return 0; } @@ -1810,11 +1851,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return true; } -/// \brief Emit SSE instructions to lower the select. +/// \brief Emit SSE or AVX instructions to lower the select. /// /// Try to use SSE1/SSE2 instructions to simulate a select without branches. /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary -/// SSE instructions are available. +/// SSE instructions are available. If AVX is available, try to use a VBLENDV. bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have @@ -1850,19 +1891,17 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - static unsigned OpcTable[2][2][4] = { - { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, - { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, - { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, - { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + // Choose the SSE instruction sequence based on data type (float or double). + static unsigned OpcTable[2][4] = { + { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } }; - bool HasAVX = Subtarget->hasAVX(); unsigned *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; - case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; - case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; } const Value *LHS = I->getOperand(1); @@ -1884,14 +1923,33 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, - CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, - LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, - RHSReg, RHSIsKill); - unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, - AndReg, /*IsKill=*/true); + unsigned ResultReg; + + if (Subtarget->hasAVX()) { + // If we have AVX, create 1 blendv instead of 3 logic instructions. + // Blendv was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + unsigned CmpOpcode = + (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + unsigned BlendOpcode = + (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + } else { + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + } updateValueMap(I, ResultReg); return true; } @@ -2015,38 +2073,30 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) { if (OpReg == 0) return false; - bool HasAVX = Subtarget->hasAVX(); const TargetRegisterClass *RC = nullptr; unsigned Opcode; - if (I->getType()->isDoubleTy() && X86ScalarSSEf64) { + if (I->getType()->isDoubleTy()) { // sitofp int -> double - Opcode = HasAVX ? X86::VCVTSI2SDrr : X86::CVTSI2SDrr; + Opcode = X86::VCVTSI2SDrr; RC = &X86::FR64RegClass; - } else if (I->getType()->isFloatTy() && X86ScalarSSEf32) { + } else if (I->getType()->isFloatTy()) { // sitofp int -> float - Opcode = HasAVX ? X86::VCVTSI2SSrr : X86::CVTSI2SSrr; + Opcode = X86::VCVTSI2SSrr; RC = &X86::FR32RegClass; } else return false; + // The target-independent selection algorithm in FastISel already knows how + // to select a SINT_TO_FP if the target is SSE but not AVX. This code is only + // reachable if the subtarget has AVX. + assert(Subtarget->hasAVX() && "Expected a subtarget with AVX!"); - unsigned ImplicitDefReg = 0; - if (HasAVX) { - ImplicitDefReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); - } - - const MCInstrDesc &II = TII.get(Opcode); - OpReg = constrainOperandRegClass(II, OpReg, (HasAVX ? 2 : 1)); - - unsigned ResultReg = createResultReg(RC); - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg); - if (ImplicitDefReg) - MIB.addReg(ImplicitDefReg, RegState::Kill); - MIB.addReg(OpReg); + unsigned ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + unsigned ResultReg = + fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); updateValueMap(I, ResultReg); return true; } @@ -3053,7 +3103,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Add an implicit use GOT pointer in EBX. 
if (Subtarget->isPICStyleGOT()) diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index c8e5f64..3b0bd03 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -32,10 +32,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" @@ -300,7 +300,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // function. If it is all integer, there is nothing for us to do! bool FPIsUsed = false; - assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); for (unsigned i = 0; i <= 6; ++i) if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { FPIsUsed = true; @@ -438,7 +438,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Rewind to first instruction newly inserted. while (Start != BB.begin() && std::prev(Start) != PrevI) --Start; dbgs() << "Inserted instructions:\n\t"; - Start->print(dbgs(), &MF.getTarget()); + Start->print(dbgs()); while (++Start != std::next(I)) {} } dumpStack(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index cead099..1d2c73c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -581,7 +581,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); - bool IsWin64 = STI.isTargetWin64(); + bool IsWin64 = STI.isCallingConvWin64(Fn->getCallingConv()); // Not necessarily synonymous with IsWin64. bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8d50ae1..fb12ce5 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -228,7 +228,7 @@ namespace { /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; void EmitSpecialCodeForMain(); @@ -1004,6 +1004,15 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, switch (N.getOpcode()) { default: break; + case ISD::FRAME_ALLOC_RECOVER: { + if (!AM.hasSymbolicDisplacement()) + if (const auto *ESNode = dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) + if (ESNode->getOpcode() == ISD::TargetExternalSymbol) { + AM.ES = ESNode->getSymbol(); + return false; + } + break; + } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); if (!FoldOffsetIntoAddress(Val, AM)) @@ -2805,14 +2814,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } bool X86DAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; - switch (ConstraintCode) { - case 'o': // offsetable ?? - case 'v': // not offsetable ?? + switch (ConstraintID) { + case InlineAsm::Constraint_o: // offsetable ?? + case InlineAsm::Constraint_v: // not offsetable ?? default: return true; - case 'm': // memory + case InlineAsm::Constraint_m: // memory if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6866be7..8b92e70 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -77,119 +76,6 @@ static cl::opt<int> ReciprocalEstimateRefinementSteps( static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl, - unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/vectorWidth; - EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, - VT.getVectorNumElements()/Factor); - - // Extract from UNDEF is UNDEF. - if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getUNDEF(ResultVT); - - // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR - unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); - - // If the input is a buildvector just emit a smaller one. - if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); -} - -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 -/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 -/// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. 
It need not be aligned to a 128-bit boundary. That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert((Vec.getValueType().is256BitVector() || - Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); -} - -/// Generate a DAG to grab 256-bits from a 512-bit vector. -static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); -} - -static SDValue InsertSubVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl, unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - // Inserting UNDEF is Result - if (Vec.getOpcode() == ISD::UNDEF) - return Result; - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - EVT ResultVT = Result.getValueType(); - - // Insert the relevant vectorWidth bits. - unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); -} - -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or -/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit boundary. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG,SDLoc dl) { - assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); -} - -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); -} - -/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 -/// instructions. This is used because creating CONCAT_VECTOR nodes of -/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower -/// large BUILD_VECTORS. -static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert128BitVector(V, V2, NumElems/2, DAG, dl); -} - -static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert256BitVector(V, V2, NumElems/2, DAG, dl); -} - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -871,35 +757,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. 
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand); - setOperationAction(ISD::MULHS, MVT::v4i16, Expand); - setOperationAction(ISD::MULHS, MVT::v2i32, Expand); - setOperationAction(ISD::MULHS, MVT::v1i64, Expand); - setOperationAction(ISD::AND, MVT::v8i8, Expand); - setOperationAction(ISD::AND, MVT::v4i16, Expand); - setOperationAction(ISD::AND, MVT::v2i32, Expand); - setOperationAction(ISD::AND, MVT::v1i64, Expand); - setOperationAction(ISD::OR, MVT::v8i8, Expand); - setOperationAction(ISD::OR, MVT::v4i16, Expand); - setOperationAction(ISD::OR, MVT::v2i32, Expand); - setOperationAction(ISD::OR, MVT::v1i64, Expand); - setOperationAction(ISD::XOR, MVT::v8i8, Expand); - setOperationAction(ISD::XOR, MVT::v4i16, Expand); - setOperationAction(ISD::XOR, MVT::v2i32, Expand); - setOperationAction(ISD::XOR, MVT::v1i64, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { + setOperationAction(ISD::MULHS, MMXTy, Expand); + setOperationAction(ISD::AND, MMXTy, Expand); + setOperationAction(ISD::OR, MMXTy, Expand); + setOperationAction(ISD::XOR, MMXTy, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); + setOperationAction(ISD::SELECT, MMXTy, Expand); + setOperationAction(ISD::BITCAST, MMXTy, Expand); + } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); - setOperationAction(ISD::SELECT, MVT::v8i8, Expand); - setOperationAction(ISD::SELECT, MVT::v4i16, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v1i64, Expand); - setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); - setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); - setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); - setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); @@ -1065,27 +932,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FRINT, MVT::v4f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); - setOperationAction(ISD::FRINT, MVT::v2f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, 
RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + } // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1474,7 +1327,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); @@ -1576,6 +1428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -1599,7 +1455,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -3189,7 +3048,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3906,21 +3765,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. 
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, - unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors @@ -4083,9 +3927,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { - assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); + + assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) + && "Unexpected vector type"); + assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) + && "Unexpected vector type"); SDValue Cst = DAG.getConstant(0, MVT::i1); - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -4093,6 +3941,162 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits()/vectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(ResultVT); + + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) + * ElemsPerChunk); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, + ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); +} + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); +} + +/// Generate a DAG to grab 256-bits from a 512-bit vector. 
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + + // For insertion into the zero index (low half) of a 256-bit vector, it is + // more efficient to generate a blend with immediate instead of an insert*128. + // We are still creating an INSERT_SUBVECTOR below with an undef node to + // extend the subvector to the size of the result vector. Make sure that + // we are not recursing on that node by checking for undef here. + if (IdxVal == 0 && Result.getValueType().is256BitVector() && + Result.getOpcode() != ISD::UNDEF) { + EVT ResultVT = Result.getValueType(); + SDValue ZeroIndex = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(ResultVT); + SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, + Vec, ZeroIndex); + + // The blend instruction, and therefore its mask, depend on the data type. + MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + if (ScalarType.isFloatingPoint()) { + // Choose either vblendps (float) or vblendpd (double). + unsigned ScalarSize = ScalarType.getSizeInBits(); + assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); + unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; + SDValue Mask = DAG.getConstant(MaskVal, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + + // AVX2 is needed for 256-bit integer blend support. + // Integers must be cast to 32-bit because there is only vpblendd; + // vpblendw can't be used for this because it has a handicapped mask. + + // If we don't have AVX2, then cast to float. Using a wrong domain blend + // is still more efficient than using the wrong domain vinsertf128 that + // will be created by InsertSubVector(). + MVT CastVT = Subtarget.hasAVX2() ? 
MVT::v8i32 : MVT::v8f32; + + SDValue Mask = DAG.getConstant(0x0f, MVT::i8); + Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256); + Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); + return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256); + } + + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + /// getOnesVector - Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. @@ -5567,8 +5571,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); - if (Broadcast.getNode()) + if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5635,12 +5638,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.is256BitVector() || VT.is512BitVector()) { + if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); } - assert(VT.is128BitVector() && "Expected an SSE value type!"); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5742,24 +5746,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // If element VT is < 32 bits, convert it to inserts into a zero vector. 
- if (EVTBits == 8 && NumElems == 16) { - SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 8 && NumElems == 16) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; - if (EVTBits == 16 && NumElems == 8) { - SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 16 && NumElems == 8) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS - if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); - if (V.getNode()) + if (EVTBits == 32 && NumElems == 4) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) return V; - } // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector<SDValue, 8> V(NumElems); @@ -5807,13 +5807,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); - if (LD.getNode()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; // Check for a build vector from mostly shuffle plus few inserting. - SDValue Sh = buildFromShuffleMostly(Op, DAG); - if (Sh.getNode()) + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. @@ -5893,8 +5891,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType(); +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG & DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + unsigned NumOfOperands = Op.getNumOperands(); + + assert(isPowerOf2_32(NumOfOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + if (NumOfOperands > 2) { + MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + ResVT.getVectorNumElements()/2); + SmallVector<SDValue, 2> Ops; + for (unsigned i = 0; i < NumOfOperands/2; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + Ops.clear(); + for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); + } + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); + + if (IsZeroV1 && IsZeroV2) + return getZeroVector(ResVT, Subtarget, DAG, dl); + + SDValue ZeroIdx = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(ResVT); + unsigned NumElems = ResVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); + + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); + V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); + if (IsZeroV1) + return V2; + + V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + // Zero the upper bits of V1 + V1 
= DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); + V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); + if (IsZeroV2) + return V1; + return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() == MVT::i1) + return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); @@ -6935,8 +6989,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, "a sorted mask where the broadcast " "comes from V1."); - // Go up the chain of (vector) values to try and find a scalar load that - // we can combine with the broadcast. + // Go up the chain of (vector) values to find a scalar load that we can + // combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { @@ -6973,12 +7027,12 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); - // If the scalar isn't a load we can't broadcast from it in AVX1, only with - // AVX2. + // If the scalar isn't a load, we can't broadcast from it in AVX1. + // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { - // We can't broadcast from a vector register w/o AVX2, and we can only + // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } @@ -7689,10 +7743,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. +/// +/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each +/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to +/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 +/// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( - SDLoc DL, SDValue V, MutableArrayRef<int> Mask, + SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + assert(VT.getScalarType() == MVT::i16 && "Bad input type!"); + MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + + assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); @@ -7845,9 +7907,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // Adjust the mask to match the new locations of A and B.
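The LowerCONCAT_VECTORSvXi1 routine added above builds the wide mask by shifting the high operand into the upper bit positions and OR-ing in the low operand with its upper bits cleared. A minimal scalar sketch of that idea, not part of the patch, assuming two v8i1 masks held as 8-bit integers are concatenated into a v16i1 mask held as a 16-bit integer (the helper name is illustrative):

#include <cstdint>
// Concatenate two 8-element lane masks into a 16-element mask, mirroring the
// VSHLI / VSHLI+VSRLI / OR sequence applied to the mask registers above.
static uint16_t concatMask8(uint8_t Lo, uint8_t Hi) {
  uint16_t High = static_cast<uint16_t>(Hi) << 8;    // X86ISD::VSHLI: move Hi into the upper half
  uint16_t Low = static_cast<uint16_t>(Lo) & 0x00FF; // VSHLI+VSRLI pair: clear Lo's upper bits
  return High | Low;                                 // ISD::OR merges the two halves
}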
@@ -7859,8 +7921,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), - Mask); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, + DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -8083,15 +8145,15 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // At this point, each half should contain all its inputs, and we can then @@ -8105,7 +8167,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DAG)); // Do a half shuffle with the high mask after shifting its values down. @@ -8113,7 +8175,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DAG)); return V; @@ -8232,8 +8294,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, Subtarget, DAG)) return Rotate; - return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG); } assert(std::any_of(Mask.begin(), Mask.end(), isV1) && @@ -8946,7 +9008,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be - // less expensive. The flags track wether the given lane contains an element + // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; for (int i = 0, Size = Mask.size(); i < Size; ++i) @@ -8986,34 +9048,78 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and the + // zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + + // Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - // Check for patterns which can be matched with a single insert of a 128-bit - // subvector. - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) || - isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); - } - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, - DAG.getIntPtrConstant(2)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. + if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + } + + // Otherwise form a 128-bit permutation. After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + int MaskLO = Mask[0]; + if (MaskLO == SM_SentinelUndef) + MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; + + int MaskHI = Mask[2]; + if (MaskHI == SM_SentinelUndef) + MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + + unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. + if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; } - // Otherwise form a 128-bit permutation. - // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. 
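The PermMask computed above follows the vperm2f128/vperm2i128 control-byte layout described in the comment: bits [1:0] and [5:4] each select a 128-bit lane (0-1 from V1, 2-3 from V2) for the low and high halves of the destination, and bits [3] and [7] zero the corresponding half instead. A hedged sketch, not part of the patch, of how such an immediate is assembled (the helper name is illustrative):

static unsigned buildVPerm2X128Imm(unsigned LoLane, unsigned HiLane,
                                   bool ZeroLo, bool ZeroHi) {
  unsigned Imm = (LoLane & 0x3) | ((HiLane & 0x3) << 4);
  if (ZeroLo)
    Imm |= 0x08; // bit 3: zero the low 128 bits of the destination
  if (ZeroHi)
    Imm |= 0x80; // bit 7: zero the high 128 bits of the destination
  return Imm;
}

For example, buildVPerm2X128Imm(1, 2, false, false) yields 0x21, the familiar "high lane of V1, low lane of V2" cross-lane shuffle, which is also what the Mask[n] / 2 arithmetic above produces for a v4i64 mask of {2, 3, 4, 5}.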
- unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, MVT::i8)); } @@ -9326,6 +9432,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; }); + if (NumV2Elements == 1 && Mask[0] >= 8) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -9557,6 +9672,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v16 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + } + SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { @@ -10118,8 +10242,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. - SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG); - if (BlendOp.getNode()) + if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // Variable blends are only legal from SSE4.1 onward. @@ -10421,17 +10544,31 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - // Get the desired 128-bit vector half. + // With a 256-bit vector, we can insert into the zero element efficiently + // using a blend if we have AVX or AVX2 and the right data type. + if (VT.is256BitVector() && IdxVal == 0) { + // TODO: It is worthwhile to cast integer to floating point and back + // and incur a domain crossing penalty if that's what we'll end up + // doing anyway after extracting to a 128-bit vector. + if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + N2 = DAG.getIntPtrConstant(1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + } + } + + // Get the desired 128-bit vector chunk. SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); - // Insert the element into the desired half. + // Insert the element into the desired chunk. 
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); - // Insert the changed part back to the 256-bit vector + // Insert the changed part back into the bigger vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); @@ -10456,16 +10593,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } if (EltVT == MVT::f32) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into - // these - // bits. For example (insert (extract, 3), 2) could be matched by - // putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. + + const Function *F = DAG.getMachineFunction().getFunction(); + bool MinSize = F->hasFnAttribute(Attribute::MinSize); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32-bits into the low 32-bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. + N2 = DAG.getIntPtrConstant(1); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } N2 = DAG.getIntPtrConstant(IdxVal << 4); // Create this as a scalar to vector.. 
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); @@ -10593,6 +10743,37 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + if (OpVT.getVectorElementType() == MVT::i1) { + if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal + return Op; + SDValue ZeroIdx = DAG.getIntPtrConstant(0); + SDValue Undef = DAG.getUNDEF(OpVT); + unsigned NumElems = OpVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); + + if (IdxVal == OpVT.getVectorNumElements() / 2) { + // Zero upper bits of the Vec + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); + } + if (IdxVal == 0) { + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + // Zero upper bits of the Vec2 + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); + // Zero lower bits of the Vec + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); + } + } return SDValue(); } @@ -13149,9 +13330,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op1.getValueType(); SDValue CC; - // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops - // are available. Otherwise fp cmovs get lowered into a less efficient branch - // sequence later on. + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available or VBLENDV if AVX is available. + // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && @@ -13166,8 +13347,42 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(SSECC, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); } + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); + + // If we have AVX, we can use a variable vector select (VBLENDV) instead + // of 3 logic instructions for size savings and potentially speed. + // Unfortunately, there is no scalar form of VBLENDV. + + // If either operand is a constant, don't try this. We can expect to + // optimize away at least one of the logic instructions later in that + // case, so that sequence would be faster than a variable blend. + + // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + + if (Subtarget->hasAVX() && + !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { + + // Convert to vectors, do a VSELECT, and convert back to scalar. + // All of the conversions should be optimized away. + + EVT VecVT = VT == MVT::f32 ? 
MVT::v4f32 : MVT::v2f64; + SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); + SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); + SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); + + EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp); + + SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + VSel, DAG.getIntPtrConstant(0)); + } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); @@ -14595,6 +14810,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: case Intrinsic::x86_avx512_mask_valign_d_512: // Vector source operands are swapped. @@ -16039,21 +16261,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - SDValue V; assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); - V = LowerScalarImmediateShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; - V = LowerScalarVariableShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) return Op; + // AVX2 has VPSLLV/VPSRAV/VPSRLV. if (Subtarget->hasInt256()) { if (Op.getOpcode() == ISD::SRL && @@ -16068,6 +16288,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return Op; } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the + // shifts per-lane and then shuffle the partial results back together. + if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { + // Splat the shift amounts so the scalar shifts above will catch it. + SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); + SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); + SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); + return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. @@ -16238,7 +16469,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); - } + } // Decompose 256-bit shifts into smaller 128-bit shifts. 
if (VT.is256BitVector()) { @@ -16254,12 +16485,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDValue Amt1, Amt2; if (Amt.getOpcode() == ISD::BUILD_VECTOR) { // Constant shift amount - SmallVector<SDValue, 4> Amt1Csts; - SmallVector<SDValue, 4> Amt2Csts; - for (unsigned i = 0; i != NumElems/2; ++i) - Amt1Csts.push_back(Amt->getOperand(i)); - for (unsigned i = NumElems/2; i != NumElems; ++i) - Amt2Csts.push_back(Amt->getOperand(i)); + SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); + ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); + ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); @@ -16386,14 +16614,17 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { return needsCmpXchgNb(PTy->getElementType()); } -bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. - if (MemType->getPrimitiveSizeInBits() > NativeWidth) - return needsCmpXchgNb(MemType); + if (MemType->getPrimitiveSizeInBits() > NativeWidth) { + return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; + } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { @@ -16403,13 +16634,14 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. - return false; + return AtomicRMWExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty(); + return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -16417,7 +16649,7 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. 
- return true; + return AtomicRMWExpansionKind::CmpXChg; } } @@ -16874,7 +17106,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); @@ -17719,7 +17951,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, // 9 ) EFLAGS (implicit-def) assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); - assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + static_assert(X86::AddrNumOperands == 5, + "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI->getOperand(0).getReg(); MachineOperand &Base = MI->getOperand(1); @@ -18095,6 +18328,92 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); + + // We also lower double CMOVs: + // (CMOV (CMOV F, T, cc1), T, cc2) + // to two successive branches. For that, we look for another CMOV as the + // following instruction. + // + // Without this, we would add a PHI between the two jumps, which ends up + // creating a few copies all around. For instance, for + // + // (sitofp (zext (fcmp une))) + // + // we would generate: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // movaps %xmm0, %xmm1 + // jne .LBB5_2 + // xorps %xmm1, %xmm1 + // .LBB5_2: + // jp .LBB5_4 + // movaps %xmm1, %xmm0 + // .LBB5_4: + // retq + // + // because this custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both CMOVs in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... + // D: empty + // E: PHI [X, A], [X, C], [Y, D] + // + // Which, in our sitofp/fcmp example, gives us something like: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // jne .LBB5_4 + // jp .LBB5_4 + // xorps %xmm0, %xmm0 + // .LBB5_4: + // retq + // + MachineInstr *NextCMOV = nullptr; + MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) + NextCMOV = &*NextMIIt; + + MachineBasicBlock *jcc1MBB = nullptr; + + // If we have a double CMOV, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (NextCMOV) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); @@ -18103,8 +18422,10 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - if (!MI->killsRegister(X86::EFLAGS) && - !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { + + MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -18115,7 +18436,19 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); + if (NextCMOV) { + // The fallthrough block may be jcc1MBB, if we have a double CMOV. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fallthrough the copy0MBB, and + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. @@ -18123,6 +18456,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + if (NextCMOV) { + unsigned Opc2 = X86::GetCondBranchFromCond( + (X86::CondCode)NextCMOV->getOperand(3).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -18131,10 +18470,22 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(X86::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineInstrBuilder MIB = + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + // If we have a double CMOV, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (NextCMOV) { + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + // Copy the PHI result to the register defined by the second CMOV. + BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), + DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()); + NextCMOV->eraseFromParent(); + } MI->eraseFromParent(); // The pseudo instruction is gone now. return sinkMBB; @@ -18218,7 +18569,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, // Calls into a routine in libgcc to allocate more space from the heap. 
const uint32_t *RegMask = - Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); + Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -18303,7 +18654,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); + Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -19132,9 +19483,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // Note that even with AVX we prefer the PSHUFD form of shuffle for integer // vectors because it can have a load folded into it that UNPCK cannot. This // doesn't preclude something switching to the shorter encoding post-RA. - if (FloatDomain) { - if (Mask.equals(0, 0) || Mask.equals(1, 1)) { - bool Lo = Mask.equals(0, 0); + // + // FIXME: Should teach these routines about AVX vector widths. + if (FloatDomain && VT.getSizeInBits() == 128) { + if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { + bool Lo = Mask.equals({0, 0}); unsigned Shuffle; MVT ShuffleVT; // Check if we have SSE3 which will let us use MOVDDUP. That instruction @@ -19163,8 +19516,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, return true; } if (Subtarget->hasSSE3() && - (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { - bool Lo = Mask.equals(0, 0, 2, 2); + (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { + bool Lo = Mask.equals({0, 0, 2, 2}); unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19177,8 +19530,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, /*AddTo*/ true); return true; } - if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { - bool Lo = Mask.equals(0, 0, 1, 1); + if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { + bool Lo = Mask.equals({0, 0, 1, 1}); unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19196,12 +19549,12 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. - if (!FloatDomain && - (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || - Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, - 15))) { + if (!FloatDomain && VT.getSizeInBits() == 128 && + (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || + Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals( + {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { bool Lo = Mask[0] == 0; unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -19237,9 +19590,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector<SDValue, 16> PSHUFBMask; - assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); - int Ratio = 16 / Mask.size(); - for (unsigned i = 0; i < 16; ++i) { + int NumBytes = VT.getSizeInBits() / 8; + int Ratio = NumBytes / Mask.size(); + for (int i = 0; i < NumBytes; ++i) { if (Mask[i / Ratio] == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; @@ -19249,12 +19602,13 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, : 255; PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); } - Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); + Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); - Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); @@ -19312,10 +19666,6 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return false; // Bail if we hit a non-vector. - // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit - // version should be added. - if (VT.getSizeInBits() != 128) - return false; assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); @@ -19418,12 +19768,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); (void)HaveMask; assert(HaveMask); + // If we have more than 128-bits, only the low 128-bits of shuffle mask + // matter. Check that the upper masks are repeats and remove them. + if (VT.getSizeInBits() > 128) { + int LaneElts = 128 / VT.getScalarSizeInBits(); +#ifndef NDEBUG + for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) + for (int j = 0; j < LaneElts; ++j) + assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts && + "Mask doesn't repeat in high 128-bit lanes!"); +#endif + Mask.resize(LaneElts); + } + switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; @@ -19496,7 +19860,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. - if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + if (V.getSimpleValueType().getScalarType() != MVT::i8 && + V.getSimpleValueType().getScalarType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. 
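The PSHUFB path above now derives the byte count from the vector width instead of assuming 16 bytes. The expansion itself is the usual widening of an element-level shuffle mask into a byte-level one; a rough sketch of that expansion, not part of the patch (the helper name is illustrative, and sentinel handling is simplified so that any negative entry becomes a zeroed byte):

#include <vector>
static std::vector<int> widenToByteMask(const std::vector<int> &ElemMask,
                                        int NumBytes) {
  int Ratio = NumBytes / static_cast<int>(ElemMask.size()); // bytes per element
  std::vector<int> ByteMask(NumBytes);
  for (int i = 0; i < NumBytes; ++i) {
    int M = ElemMask[i / Ratio];
    // pshufb zeroes a destination byte when bit 7 of its control byte is set,
    // so a sentinel element maps to 0x80; otherwise select byte (i % Ratio) of
    // source element M.
    ByteMask[i] = M < 0 ? 0x80 : M * Ratio + i % Ratio;
  }
  return ByteMask;
}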
@@ -19670,8 +20035,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: - assert(VT == MVT::v8i16); - (void)VT; + assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. @@ -19679,17 +20043,18 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. - if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + V = DAG.getNode(ISD::BITCAST, DL, DVT, V); DCI.AddToWorklist(V.getNode()); - V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DAG)); DCI.AddToWorklist(V.getNode()); - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + return DAG.getNode(ISD::BITCAST, DL, VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. @@ -19717,18 +20082,14 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; - const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; - const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; - if (std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackLoMask)) || - std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackHiMask))) { + if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || + makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, MVT::v8i16, V, V); + DL, VT, V, V); } } } @@ -19876,10 +20237,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } } - // Only handle 128 wide vector from here on. - if (!VT.is128BitVector()) - return SDValue(); - // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. @@ -20987,6 +21344,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); } +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. 
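// Standalone illustration of the half-word-shuffle to PSHUFD rewrite above:
// applying the word mask {2,3,0,1} to one half of a v8i16 is the same as
// swapping the two corresponding dwords of the v4i32 view. Plain arrays stand
// in for SDValues; this is a sketch of the equivalence, not the DAG code.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

int main() {
  std::array<uint16_t, 8> In = {0, 1, 2, 3, 4, 5, 6, 7};

  // PSHUFLW with immediate mask {2,3,0,1}: permute words 0..3, keep 4..7.
  std::array<uint16_t, 8> ByWords = In;
  const int WordMask[4] = {2, 3, 0, 1};
  for (int i = 0; i < 4; ++i) ByWords[i] = In[WordMask[i]];

  // Equivalent PSHUFD: swap dwords 0 and 1 (DOffset = 0 for the PSHUFLW case).
  std::array<uint32_t, 4> D;
  std::memcpy(D.data(), In.data(), sizeof(D));
  std::swap(D[0], D[1]);
  std::array<uint16_t, 8> ByDwords;
  std::memcpy(ByDwords.data(), D.data(), sizeof(D));

  assert(ByWords == ByDwords);
  return 0;
}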
+/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + if (Cond->getOpcode() == X86ISD::CMP) { + ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); + if (!CondOp1C || !CondOp1C->isNullValue()) + return false; + + Cond = Cond->getOperand(0); + } + + isAnd = false; + + SDValue SetCC0, SetCC1; + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + }; + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -21156,6 +21556,44 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. + // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + return SDValue(); } @@ -21166,24 +21604,16 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, default: return SDValue(); // SSE/AVX/AVX2 blend intrinsics. case Intrinsic::x86_avx2_pblendvb: - case Intrinsic::x86_avx2_pblendw: - case Intrinsic::x86_avx2_pblendd_128: - case Intrinsic::x86_avx2_pblendd_256: // Don't try to simplify this intrinsic if we don't have AVX2. if (!Subtarget->hasAVX2()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_avx_blend_pd_256: - case Intrinsic::x86_avx_blend_ps_256: case Intrinsic::x86_avx_blendv_pd_256: case Intrinsic::x86_avx_blendv_ps_256: // Don't try to simplify this intrinsic if we don't have AVX. 
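// Standalone check of the select identities behind the double-CMOV fold above,
// writing X86 CMOV(F, T, cc) as the C expression (cc ? T : F). Plain ints and
// bools are used; this is a sketch of the algebra, not the DAG transform.
#include <cassert>

static int sel(int F, int T, bool CC) { return CC ? T : F; }

int main() {
  const int F = 10, T = 99;
  for (int c0 = 0; c0 <= 1; ++c0)
    for (int c1 = 0; c1 <= 1; ++c1) {
      bool cc0 = c0, cc1 = c1;
      // (CMOV F, T, (cc0 | cc1)) == (CMOV (CMOV F, T, cc0), T, cc1)
      assert(sel(F, T, cc0 || cc1) == sel(sel(F, T, cc0), T, cc1));
      // (CMOV F, T, (cc0 & cc1)) == (CMOV (CMOV T, F, !cc0), F, !cc1)
      assert(sel(F, T, cc0 && cc1) == sel(sel(T, F, !cc0), F, !cc1));
    }
  return 0;
}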
if (!Subtarget->hasAVX()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_sse41_pblendw: - case Intrinsic::x86_sse41_blendpd: - case Intrinsic::x86_sse41_blendps: case Intrinsic::x86_sse41_blendvps: case Intrinsic::x86_sse41_blendvpd: case Intrinsic::x86_sse41_pblendvb: { @@ -21640,7 +22070,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, // an and with a mask. // We'd like to try to combine that into a shuffle with zero // plus a bitcast, removing the and. - if (N0.getOpcode() != ISD::BITCAST || + if (N0.getOpcode() != ISD::BITCAST || N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); @@ -21670,7 +22100,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, unsigned ResSize = N1.getValueType().getScalarSizeInBits(); // Make sure the splat matches the mask we expect - if (SplatBitSize > ResSize || + if (SplatBitSize > ResSize || (SplatValue + 1).exactLogBase2() != (int)SrcSize) return SDValue(); @@ -21724,12 +22154,10 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget); - if (Zext.getNode()) + if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) return Zext; - SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; EVT VT = N->getValueType(0); @@ -22521,7 +22949,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - CommuteVectorShuffleMask(RMask, NumElts); + ShuffleVectorSDNode::commuteMask(RMask); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask @@ -22630,7 +23058,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); - + return SDValue(); } @@ -22864,45 +23292,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), - RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), - addV, DAG.getConstant(0, addV.getValueType()), CC); + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, + DAG.getConstant(0, addV.getValueType()), CC); } - if (VT.getScalarType() == MVT::i1) { - bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); - bool IsVZero0 = 
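// Standalone check of the identity used by the setcc rewrite above for eq/ne
// compares: (0 - X) == Y iff (Y + X) == 0, and likewise for !=, including
// modular wraparound. Exhaustive over 8-bit values; a sketch of the
// arithmetic only, not the combine itself.
#include <cassert>
#include <cstdint>

int main() {
  for (int x = 0; x < 256; ++x)
    for (int y = 0; y < 256; ++y) {
      uint8_t X = static_cast<uint8_t>(x), Y = static_cast<uint8_t>(y);
      uint8_t Sub = static_cast<uint8_t>(0 - X); // LHS = sub(0, X)
      uint8_t Add = static_cast<uint8_t>(Y + X); // addV = add(Y, X)
      assert((Sub == Y) == (Add == 0));
      assert((Sub != Y) == (Add != 0));
    }
  return 0;
}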
ISD::isBuildVectorAllZeros(LHS.getNode()); - if (!IsSEXT0 && !IsVZero0) - return SDValue(); - bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) && - (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + if (VT.getScalarType() == MVT::i1 && + (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!IsSEXT1 && !IsVZero1) - return SDValue(); + if (!IsSEXT0 || !IsVZero1) { + // Swap the operands and update the condition code. + std::swap(LHS, RHS); + CC = ISD::getSetCCSwappedOperands(CC); + + IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + } if (IsSEXT0 && IsVZero1) { - assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) + assert(VT == LHS.getOperand(0).getValueType() && + "Uexpected operand type"); + if (CC == ISD::SETGT) + return DAG.getConstant(0, VT); + if (CC == ISD::SETLE) + return DAG.getConstant(1, VT); + if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); + + assert((CC == ISD::SETNE || CC == ISD::SETLT) && + "Unexpected condition code!"); return LHS.getOperand(0); } - if (IsSEXT1 && IsVZero0) { - assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETEQ) - return DAG.getNOT(DL, RHS.getOperand(0), VT); - return RHS.getOperand(0); - } } return SDValue(); @@ -22940,7 +23374,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, // countS and just gets an f32 from that address. unsigned DestIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; - + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); // Create this as a scalar to vector to match the instruction pattern. @@ -22964,7 +23398,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { // pattern-matching possibilities related to scalar math ops in SSE/AVX. // x86InstrInfo knows how to commute this back after instruction selection // if it would help register allocation. - + // TODO: If optimizing for size or a processor that doesn't suffer from // partial register update stalls, this should be transformed into a MOVSD // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. @@ -23503,27 +23937,23 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -namespace { - // Helper to match a string separated by whitespace. - bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { - s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. - - for (unsigned i = 0, e = args.size(); i != e; ++i) { - StringRef piece(*args[i]); - if (!s.startswith(piece)) // Check if the piece matches. - return false; +// Helper to match a string separated by whitespace. +static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { + S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. - s = s.substr(piece.size()); - StringRef::size_type pos = s.find_first_not_of(" \t"); - if (pos == 0) // We matched a prefix. - return false; + for (StringRef Piece : Pieces) { + if (!S.startswith(Piece)) // Check if the piece matches. 
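// Standalone check of the sext(i1)-vs-zero folds above: with S = b ? -1 : 0,
// each signed compare against 0 reduces to a constant, b, or !b, exactly as
// the code selects per condition code. A sketch of the truth table only.
#include <cassert>

int main() {
  for (int bi = 0; bi <= 1; ++bi) {
    bool b = bi;
    int S = b ? -1 : 0;       // sign_extend of an i1 element
    assert((S > 0) == false); // SETGT -> constant 0
    assert((S <= 0) == true); // SETLE -> constant 1
    assert((S == 0) == !b);   // SETEQ -> NOT(b)
    assert((S >= 0) == !b);   // SETGE -> NOT(b)
    assert((S != 0) == b);    // SETNE -> b
    assert((S < 0) == b);     // SETLT -> b
  }
  return 0;
}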
+ return false; - s = s.substr(pos); - } + S = S.substr(Piece.size()); + StringRef::size_type Pos = S.find_first_not_of(" \t"); + if (Pos == 0) // We matched a prefix. + return false; - return s.empty(); + S = S.substr(Pos); } - const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; + + return S.empty(); } static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { @@ -23563,12 +23993,12 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 - if (matchAsm(AsmPieces[0], "bswap", "$0") || - matchAsm(AsmPieces[0], "bswapl", "$0") || - matchAsm(AsmPieces[0], "bswapq", "$0") || - matchAsm(AsmPieces[0], "bswap", "${0:q}") || - matchAsm(AsmPieces[0], "bswapl", "${0:q}") || - matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { + if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || + matchAsm(AsmPieces[0], {"bswapl", "$0"}) || + matchAsm(AsmPieces[0], {"bswapq", "$0"}) || + matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); @@ -23577,8 +24007,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || - matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { + (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || + matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -23590,9 +24020,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && - matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && - matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && + matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && + matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -23607,9 +24037,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - if (matchAsm(AsmPieces[0], "bswap", "%eax") && - matchAsm(AsmPieces[1], "bswap", "%edx") && - matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && + matchAsm(AsmPieces[1], {"bswap", "%edx"}) && + matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 4423015..dd20ec2 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -30,37 +30,37 @@ namespace llvm { // Start the numbering where the builtin ops leave off. 
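// Standalone sketch of the whitespace-separated matcher used above for the
// bswap / rorw inline-asm patterns, written with std::string_view instead of
// StringRef. matchPieces is an illustrative name, not the LLVM helper itself.
#include <algorithm>
#include <cassert>
#include <initializer_list>
#include <string_view>

static bool matchPieces(std::string_view S,
                        std::initializer_list<std::string_view> Pieces) {
  S.remove_prefix(std::min(S.find_first_not_of(" \t"), S.size()));
  for (std::string_view Piece : Pieces) {
    if (S.substr(0, Piece.size()) != Piece) // Piece must match here.
      return false;
    S.remove_prefix(Piece.size());
    size_t Pos = S.find_first_not_of(" \t");
    if (Pos == 0)                           // Only matched a prefix of a word.
      return false;
    S.remove_prefix(std::min(Pos, S.size()));
  }
  return S.empty();
}

int main() {
  assert(matchPieces("  bswap $0", {"bswap", "$0"}));
  assert(!matchPieces("bswapl $0", {"bswap", "$0"}));      // prefix only
  assert(!matchPieces("bswap $0, %eax", {"bswap", "$0"})); // trailing text
  return 0;
}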
FIRST_NUMBER = ISD::BUILTIN_OP_END, - /// BSF - Bit scan forward. - /// BSR - Bit scan reverse. + /// Bit scan forward. BSF, + /// Bit scan reverse. BSR, - /// SHLD, SHRD - Double shift instructions. These correspond to + /// Double shift instructions. These correspond to /// X86::SHLDxx and X86::SHRDxx instructions. SHLD, SHRD, - /// FAND - Bitwise logical AND of floating point values. This corresponds + /// Bitwise logical AND of floating point values. This corresponds /// to X86::ANDPS or X86::ANDPD. FAND, - /// FOR - Bitwise logical OR of floating point values. This corresponds + /// Bitwise logical OR of floating point values. This corresponds /// to X86::ORPS or X86::ORPD. FOR, - /// FXOR - Bitwise logical XOR of floating point values. This corresponds + /// Bitwise logical XOR of floating point values. This corresponds /// to X86::XORPS or X86::XORPD. FXOR, - /// FANDN - Bitwise logical ANDNOT of floating point values. This + /// Bitwise logical ANDNOT of floating point values. This /// corresponds to X86::ANDNPS or X86::ANDNPD. FANDN, - /// FSRL - Bitwise logical right shift of floating point values. These + /// Bitwise logical right shift of floating point values. This /// corresponds to X86::PSRLDQ. FSRL, - /// CALL - These operations represent an abstract X86 call + /// These operations represent an abstract X86 call /// instruction, which includes a bunch of information. In particular the /// operands of these node are: /// @@ -79,8 +79,7 @@ namespace llvm { /// CALL, - /// RDTSC_DAG - This operation implements the lowering for - /// readcyclecounter + /// This operation implements the lowering for readcyclecounter RDTSC_DAG, /// X86 Read Time-Stamp Counter and Processor ID. @@ -131,187 +130,186 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, - /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx. + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, - /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. + /// Repeat move, corresponds to X86::REP_MOVSx. REP_MOVS, - /// GlobalBaseReg - On Darwin, this node represents the result of the popl + /// On Darwin, this node represents the result of the popl /// at function entry, used for PIC code. GlobalBaseReg, - /// Wrapper - A wrapper node for TargetConstantPool, + /// A wrapper node for TargetConstantPool, /// TargetExternalSymbol, and TargetGlobalAddress. Wrapper, - /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP + /// Special wrapper used under X86-64 PIC mode for RIP /// relative displacements. WrapperRIP, - /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. If you think this is too close to the previous /// mnemonic, so do I; blame Intel. MOVDQ2Q, - /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX + /// Copies a 32-bit value from the low word of a MMX /// vector to a GPR. MMX_MOVD2W, - /// MMX_MOVW2D - Copies a GPR into the low 32-bit word of a MMX vector + /// Copies a GPR into the low 32-bit word of a MMX vector /// and zero out the high word. MMX_MOVW2D, - /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, - /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to + /// Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. 
PEXTRW, - /// INSERTPS - Insert any element of a 4 x float vector into any element + /// Insert any element of a 4 x float vector into any element /// of a destination 4 x floatvector. INSERTPS, - /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// Insert the lower 8-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRB. PINSRB, - /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, + /// Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. PINSRW, MMX_PINSRW, - /// PSHUFB - Shuffle 16 8-bit values within a vector. + /// Shuffle 16 8-bit values within a vector. PSHUFB, - /// ANDNP - Bitwise Logical AND NOT of Packed FP values. + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, - /// PSIGN - Copy integer sign. + /// Copy integer sign. PSIGN, - /// BLENDI - Blend where the selector is an immediate. + /// Blend where the selector is an immediate. BLENDI, - /// SHRUNKBLEND - Blend where the condition has been shrunk. + /// Blend where the condition has been shrunk. /// This is used to emphasize that the condition mask is /// no more valid for generic VSELECT optimizations. SHRUNKBLEND, - /// ADDSUB - Combined add and sub on an FP vector. + /// Combined add and sub on an FP vector. ADDSUB, - // FADD, FSUB, FMUL, FDIV, FMIN, FMAX - FP vector ops with rounding mode. + // FP vector ops with rounding mode. FADD_RND, FSUB_RND, FMUL_RND, FDIV_RND, - // SUBUS - Integer sub with unsigned saturation. + // Integer sub with unsigned saturation. SUBUS, - /// HADD - Integer horizontal add. + /// Integer horizontal add. HADD, - /// HSUB - Integer horizontal sub. + /// Integer horizontal sub. HSUB, - /// FHADD - Floating point horizontal add. + /// Floating point horizontal add. FHADD, - /// FHSUB - Floating point horizontal sub. + /// Floating point horizontal sub. FHSUB, - /// UMAX, UMIN - Unsigned integer max and min. + /// Unsigned integer max and min. UMAX, UMIN, - /// SMAX, SMIN - Signed integer max and min. + /// Signed integer max and min. SMAX, SMIN, - /// FMAX, FMIN - Floating point max and min. - /// + /// Floating point max and min. FMAX, FMIN, - /// FMAXC, FMINC - Commutative FMIN and FMAX. + /// Commutative FMIN and FMAX. FMAXC, FMINC, - /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal - /// approximation. Note that these typically require refinement + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement /// in order to obtain suitable precision. FRSQRT, FRCP, - // TLSADDR - Thread Local Storage. + // Thread Local Storage. TLSADDR, - // TLSBASEADDR - Thread Local Storage. A call to get the start address + // Thread Local Storage. A call to get the start address // of the TLS block for the current module. TLSBASEADDR, - // TLSCALL - Thread Local Storage. When calling to an OS provided + // Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. TLSCALL, - // EH_RETURN - Exception Handling helpers. + // Exception Handling helpers. EH_RETURN, - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. + // SjLj exception handling setjmp. EH_SJLJ_SETJMP, - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. + // SjLj exception handling longjmp. EH_SJLJ_LONGJMP, - /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for + /// Tail call return. See X86TargetLowering::LowerCall for /// the list of operands. 
TC_RETURN, - // VZEXT_MOVL - Vector move to low scalar and zero higher vector elements. + // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, - // VZEXT - Vector integer zero-extend. + // Vector integer zero-extend. VZEXT, - // VSEXT - Vector integer signed-extend. + // Vector integer signed-extend. VSEXT, - // VTRUNC - Vector integer truncate. + // Vector integer truncate. VTRUNC, - // VTRUNC - Vector integer truncate with mask. + // Vector integer truncate with mask. VTRUNCM, - // VFPEXT - Vector FP extend. + // Vector FP extend. VFPEXT, - // VFPROUND - Vector FP round. + // Vector FP round. VFPROUND, - // VSHL, VSRL - 128-bit vector logical left / right shift + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, - // VSHL, VSRL, VSRA - Vector shift elements + // Vector shift elements VSHL, VSRL, VSRA, - // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate + // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, - // CMPP - Vector packed double/float comparison. + // Vector packed double/float comparison. CMPP, - // PCMP* - Vector integer comparisons. + // Vector integer comparisons. PCMPEQ, PCMPGT, - // PCMP*M - Vector integer comparisons, the result is in a mask vector. + // Vector integer comparisons, the result is in a mask vector. PCMPEQM, PCMPGTM, - /// CMPM, CMPMU - Vector comparison generating mask bits for fp and + /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, CMPMU, - // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. + // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - BEXTR, // BEXTR - Bit field extract + BEXTR, // Bit field extract UMUL, // LOW, HI, FLAGS = umul LHS, RHS @@ -322,16 +320,16 @@ namespace llvm { UDIVREM8_ZEXT_HREG, SDIVREM8_SEXT_HREG, - // MUL_IMM - X86 specific multiply by immediate. + // X86-specific multiply by immediate. MUL_IMM, - // PTEST - Vector bitwise comparisons. + // Vector bitwise comparisons. PTEST, - // TESTP - Vector packed fp sign bitwise comparisons. + // Vector packed fp sign bitwise comparisons. TESTP, - // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector. + // Vector "test" in AVX-512, the result is in a mask vector. TESTM, TESTNM, @@ -697,6 +695,12 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + // FIXME: Map different constraints differently. + return InlineAsm::Constraint_m; + } + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. 
On @@ -993,7 +997,8 @@ namespace llvm { bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicRMWExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 4923bc5..509602f 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,15 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", VTName)), VTName)); + + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), + !if (!eq (EltSize, 64), "v8i64", "v16i32"), + VTName))), VTName)); + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); // The corresponding float type, e.g. v16f32 for v16i32 @@ -107,6 +116,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); } def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; @@ -1559,6 +1571,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), !strconcat("vcmp", suffix, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + def rrib_alt: AVX512PIi8<0xC2, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + !strconcat("vcmp", suffix, + "\t{{sae}, $cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc, {sae}}"), + [], d>, EVEX_B; let mayLoad = 1 in def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), @@ -2047,6 +2064,8 @@ let Predicates = [HasVLX] in { (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), @@ -2062,177 +2081,193 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag, - RegisterClass KRC, RegisterClass RC, - ValueType vt, ValueType zvt, X86MemOperand memop, - Domain d, bit IsReMaterializable = 1> { -let hasSideEffects = 
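// Rough standalone C++ restatement of the TableGen string selection above
// (the AlignedLdFrag and ZSuffix fields), intended only to make the nested
// !if logic easier to read; the function names are illustrative.
#include <string>

// Pick the PatFrag name used for aligned loads of a given vector type.
static std::string alignedLdFragName(const std::string &TypeVariant, // "i"/"f"
                                     const std::string &VTName,
                                     int SizeBits, int EltSizeBits) {
  if (TypeVariant != "i")
    return "alignedload" + VTName;
  if (SizeBits == 128) return "alignedloadv2i64";
  if (SizeBits == 256) return "alignedloadv4i64";
  if (SizeBits == 512)
    return EltSizeBits == 64 ? "alignedloadv8i64" : "alignedloadv16i32";
  return "alignedload" + VTName;
}

// Instruction-name suffix keyed off the vector width, e.g. "Z128" for 128-bit.
static std::string zSuffix(int SizeBits) {
  return SizeBits == 128 ? "Z128" : SizeBits == 256 ? "Z256" : "Z";
}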
0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + +multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag ld_frag, PatFrag mload, + bit IsReMaterializable = 1> { + let hasSideEffects = 0 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], - d>, EVEX; - def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), + _.ExeDomain>, EVEX; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; - } + "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, + EVEX, EVEX_KZ; + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, SchedRW = [WriteLoad] in - def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src), + def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))], - d>, EVEX; - - let AddedComplexity = 20 in { - let Constraints = "$src0 = $dst", hasSideEffects = 0 in { - let hasSideEffects = 0 in - def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1), - !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", - "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt (vselect KRC:$mask, - (vt RC:$src1), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], + _.ExeDomain>, EVEX; + + let Constraints = "$src0 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K; let mayLoad = 1, SchedRW = [WriteLoad] in - def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, memop:$src1), + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src1))), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; } let mayLoad = 1, SchedRW = [WriteLoad] in - def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src))), - (vt (bitconvert (zvt immAllZerosV))))))], - d>, EVEX, EVEX_KZ; + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ; } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload 
addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; } -multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, - string elty, string elsz, string vsz512, - string vsz256, string vsz128, Domain d, - Predicate prd, bit IsReMaterializable = 1> { +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { let Predicates = [prd] in - defm Z : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<ValueType>("v"##vsz512##elty##elsz), v16i32, - !cast<X86MemOperand>(elty##"512mem"), d, - IsReMaterializable>, EVEX_V512; + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, + masked_load_aligned512, IsReMaterializable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz256##elty##elsz, "v4i64")), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<ValueType>("v"##vsz256##elty##elsz), v8i32, - !cast<X86MemOperand>(elty##"256mem"), d, - IsReMaterializable>, EVEX_V256; - - defm Z128 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz128##elty##elsz, "v2i64")), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<ValueType>("v"##vsz128##elty##elsz), v4i32, - !cast<X86MemOperand>(elty##"128mem"), d, - IsReMaterializable>, EVEX_V128; + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag, + masked_load_aligned256, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, + masked_load_aligned128, IsReMaterializable>, EVEX_V128; } } +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V512; -multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, - ValueType OpVT, RegisterClass KRC, RegisterClass RC, - X86MemOperand memop, Domain d> { + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag st_frag, PatFrag mstore> { let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, - EVEX; + def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", [], + _.ExeDomain>, EVEX; let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, - EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; + def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + 
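// Standalone scalar model of the masked load/store patterns above: the "{z}"
// (rmkz) forms are the passthru == 0 case, while the merging (rmk) forms keep
// the old destination lanes. Plain vectors stand in for ZMM and mask
// registers; this is a semantic sketch, not the ISel patterns themselves.
#include <cstdint>
#include <vector>

static std::vector<int32_t> maskedLoad(const std::vector<int32_t> &Mem,
                                       const std::vector<bool> &Mask,
                                       const std::vector<int32_t> &Passthru) {
  std::vector<int32_t> R(Mask.size());
  for (size_t i = 0; i < Mask.size(); ++i)
    R[i] = Mask[i] ? Mem[i] : Passthru[i]; // zeroing form: Passthru[i] == 0
  return R;
}

static void maskedStore(std::vector<int32_t> &Mem,
                        const std::vector<bool> &Mask,
                        const std::vector<int32_t> &Src) {
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i])                           // unmasked lanes are untouched
      Mem[i] = Src[i];
}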
(ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), + OpcodeStr # + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + [], _.ExeDomain>, EVEX, EVEX_K; + def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # + "\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ; } let mayStore = 1 in { - def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX; + [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; def mrk : AVX512PI<opc, MRMDestMem, (outs), - (ins memop:$dst, KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - [], d>, EVEX, EVEX_K; + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; } + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; } -multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat, - string st_suff_512, string st_suff_256, - string st_suff_128, string elty, string elsz, - string vsz512, string vsz256, string vsz128, - Domain d, Predicate prd> { +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512), - !cast<ValueType>("v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512; + defm Z : avx512_store<opc, OpcodeStr, _.info512, store, + masked_store_unaligned>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256), - !cast<ValueType>("v"##vsz256##elty##elsz), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256; - - defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128), - !cast<ValueType>("v"##vsz128##elty##elsz), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128; + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, + masked_store_unaligned>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, + masked_store_unaligned>, EVEX_V128; } } -defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x29, "vmovaps", "alignedstore", - "512", "256", "", "f", "32", "16", "8", "4", - SSEPackedSingle, HasAVX512>, - PS, EVEX_CD8<32, CD8VF>; +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + masked_store_aligned512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + masked_store_aligned256>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + masked_store_aligned128>, EVEX_V128; + } +} -defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - avx512_store_vl<0x29, "vmovapd", "alignedstore", - "512", 
"256", "", "f", "64", "8", "4", "2", - SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, PS, EVEX_CD8<32, CD8VF>; -defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512, 0>, - avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), @@ -2276,6 +2311,7 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2285,73 +2321,36 @@ def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), - (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), - (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, - (bc_v16f32 (v16i32 immAllZerosV)))), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), - (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8f64 (v16i32 immAllZerosV)))), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), - (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; +} -defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", - "512", "256", "", "i", "32", "16", "8", "4", - SSEPackedInt, HasAVX512>, - PD, EVEX_CD8<32, 
CD8VF>; - -defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa64", "alignedstore", - "512", "256", "", "i", "64", "8", "4", "2", - SSEPackedInt, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8", - "64", "32", "16", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "", - "i", "8", "64", "32", "16", SSEPackedInt, +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI>, XD, EVEX_CD8<8, CD8VF>; -defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16", - "32", "16", "8", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "", - "i", "16", "32", "16", "8", SSEPackedInt, +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; -defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "", - "i", "32", "16", "8", "4", SSEPackedInt, +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, XS, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "", - "i", "64", "8", "4", "2", SSEPackedInt, +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, @@ -2389,37 +2388,8 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), - (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8i64 (v16i32 immAllZerosV)))), - (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), - (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), - (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), - (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -// SKX replacement -def: Pat<(masked_store addr:$ptr, 
VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; - -// KNL replacement +// NoVLX patterns +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2428,7 +2398,7 @@ def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - +} // Move Int Doubleword to Packed Double Int // @@ -3243,28 +3213,95 @@ defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic //===----------------------------------------------------------------------===// +multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, OpndItins itins, + bit IsCommutable> { -multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - SizeItins itins> { - defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32X, - f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG, - EVEX_CD8<32, CD8VT1>; - defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64X, - f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT)), + "", itins.rr, IsCommutable>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)), + "", itins.rm, IsCommutable>; + let isCodeGenOnly = 1, isCommutable = IsCommutable, + Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr>; + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rr>; + } } -let isCommutable = 1 in { -defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>; -defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>; -defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>; -defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>; +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), "", itins.rr, IsCommutable>, + EVEX_B, EVEX_RC; } -let isCommutable = 0 in { -defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>; -defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins 
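// Standalone sketch of the NoVLX fallback patterns above: without VLX a
// 256-bit masked access is performed as a 512-bit masked access whose upper
// mask bits are zero, and only the low half of the result is used. Sizes and
// the helper name are illustrative only.
#include <array>
#include <cstdint>

static std::array<int32_t, 8>
masked256ViaZMM(const int32_t *Mem, std::array<bool, 8> Mask8) {
  std::array<bool, 16> Mask16{};     // upper 8 mask bits stay false
  for (int i = 0; i < 8; ++i) Mask16[i] = Mask8[i];

  std::array<int32_t, 16> Wide{};    // zero-masked 512-bit load
  for (int i = 0; i < 16; ++i)
    if (Mask16[i]) Wide[i] = Mem[i]; // only low lanes can be touched

  std::array<int32_t, 8> Result{};   // EXTRACT_SUBREG sub_ymm of the result
  for (int i = 0; i < 8; ++i) Result[i] = Wide[i];
  return Result;
}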
itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; } +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>; + multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, bit IsCommutable> { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -3411,15 +3448,27 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + let mayLoad = 1 in defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))), + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1 in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B; +} + multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { // src2 is always 128-bit defm rr : AVX512_maskable<opc, MRMSrcReg, 
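// Standalone sketch of the four static rounding directions selected by the
// AVX512RC:$rc operand above, shown on exact rational values rounded to an
// integer so the host FP environment does not matter. The enum and helper
// names are illustrative; real EVEX embedded rounding applies per instruction.
#include <cassert>

enum RoundMode { ToNearestEven, TowardNegInf, TowardPosInf, TowardZero };

static long roundDiv(long Num, long Den, RoundMode M) { // requires Den > 0
  long Q = Num / Den, R = Num % Den;                    // C++ truncates
  if (R == 0) return Q;
  switch (M) {
  case TowardZero:   return Q;
  case TowardNegInf: return Num < 0 ? Q - 1 : Q;
  case TowardPosInf: return Num > 0 ? Q + 1 : Q;
  case ToNearestEven: {
    long Lo = Num < 0 ? Q - 1 : Q, Hi = Lo + 1;
    long DLo = Num - Lo * Den, DHi = Hi * Den - Num;
    if (DLo != DHi) return DLo < DHi ? Lo : Hi;
    return (Lo % 2 == 0) ? Lo : Hi;                     // ties go to even
  }
  }
  return Q;
}

int main() {
  assert(roundDiv(7, 2, ToNearestEven) == 4);  // 3.5 rounds to even -> 4
  assert(roundDiv(5, 2, ToNearestEven) == 2);  // 2.5 rounds to even -> 2
  assert(roundDiv(-7, 2, TowardNegInf) == -4);
  assert(roundDiv(-7, 2, TowardPosInf) == -3);
  assert(roundDiv(-7, 2, TowardZero) == -3);
  return 0;
}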
_, (outs _.RC:$dst), (ins _.RC:$src1, VR128X:$src2), OpcodeStr, @@ -3430,46 +3479,95 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + EVEX_4V; } multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { - defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512; + ValueType SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } } -multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr, - SDNode OpNode> { +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode> { defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, - v16i32_info>, EVEX_CD8<32, CD8VQ>; + avx512vl_i32_info, HasAVX512>; defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, - v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; + avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, + avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, EVEX_V128; + } } -defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - v16i32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_shift_rmi_w<bits<8> opcw, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v16i16x_info>, EVEX_V256; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v8i16x_info>, EVEX_V128; + } +} -defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - v16i32_info>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - 
v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +} -defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - v16i32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>; -defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; -defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>; -defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>; + +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x73, MRM4r, MRM4m, "vpsra", X86vsrai>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>; + +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts @@ -3481,29 +3579,71 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V; + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; } +multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo _> { - defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } } multiclass 
avx512_var_shift_types<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, - avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>; + avx512vl_i32_info>; defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, - avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; + avx512vl_i64_info>, VEX_W; } -defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; -defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; -defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>; +multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>, + EVEX_V512, VEX_W; + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + } +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, + avx512_var_shift_w<0x12, "vpsllvw", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, + avx512_var_shift_w<0x11, "vpsravw", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, + avx512_var_shift_w<0x10, "vpsrlvw", srl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP @@ -4919,81 +5059,74 @@ defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations -multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag GatherNode> { -let mayLoad = 1, hasTwoExplicitDefs = 1, +multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand memop> { +let mayLoad = 1, Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in - def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), - (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb), + (ins RC:$src1, KRC:$mask, memop:$src2), !strconcat(OpcodeStr, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - [(set _.RC:$dst, _.KRCWM:$mask_wb, - (_.VT (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, - vectoraddr:$src2)))]>, EVEX, EVEX_K, - EVEX_CD8<_.EltSize, CD8VT1>; + []>, EVEX, EVEX_K; } let ExeDomain = SSEPackedDouble in { -defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem, - mgatherv8i32>, EVEX_V512, VEX_W; -defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem, - mgatherv8i64>, EVEX_V512, VEX_W; +defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; } let ExeDomain = SSEPackedSingle in { -defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem, - mgatherv16i32>, EVEX_V512; -defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem, - mgatherv8i64>, EVEX_V512; +defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; } -defm VPGATHERDQZ 
: avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem, - mgatherv8i32>, EVEX_V512, VEX_W; -defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem, - mgatherv16i32>, EVEX_V512; - -defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem, - mgatherv8i64>, EVEX_V512, VEX_W; -defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem, - mgatherv8i64>, EVEX_V512; +defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; -multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag ScatterNode> { +defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand memop> { let mayStore = 1, Constraints = "$mask = $mask_wb" in - - def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), - (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), + def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb), + (ins memop:$dst, KRC:$mask, RC:$src2), !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), - _.KRCWM:$mask, vectoraddr:$dst))]>, - EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + []>, EVEX, EVEX_K; } let ExeDomain = SSEPackedDouble in { -defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", v8f64_info, vy64xmem, - mscatterv8i32>, EVEX_V512, VEX_W; -defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", v8f64_info, vz64mem, - mscatterv8i64>, EVEX_V512, VEX_W; +defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; } let ExeDomain = SSEPackedSingle in { -defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", v16f32_info, vz32mem, - mscatterv16i32>, EVEX_V512; -defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", v8f32x_info, vz64mem, - mscatterv8i64>, EVEX_V512; +defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; } -defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", v8i64_info, vy64xmem, - mscatterv8i32>, EVEX_V512, VEX_W; -defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", v16i32_info, vz32mem, - mscatterv16i32>, EVEX_V512; +defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", v8i64_info, vz64mem, - mscatterv8i64>, EVEX_V512, VEX_W; -defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", v8i32x_info, vz64mem, - mscatterv8i64>, EVEX_V512; +defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPSCATTERQDZ : avx512_scatter<0xA1, 
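The gather and scatter definitions above encode masked indexed loads and stores; this patch reduces their ISel patterns to empty lists, so selection happens elsewhere. As a rough illustration of the operation the gather forms describe (not of the LLVM selection code), a simplified scalar model of a vgatherdps-style gather is sketched below; the function name, the element-sized scale, and the plain-pointer interface are illustrative assumptions, and the real instruction encodes a byte scale of 1/2/4/8.

// Simplified scalar model of a masked gather: for each element whose mask
// bit is set, load from base + index[i] * scale and clear that mask bit;
// masked-off elements keep their previous destination value (merge
// semantics). The symmetric scatter case stores instead of loading.
#include <cstddef>
#include <cstdint>

void gather_model(float *dst, uint16_t &mask, const float *base,
                  const int32_t *index, int scale_in_elements,
                  size_t num_elements) {
  for (size_t i = 0; i < num_elements; ++i) {
    if (mask & (1u << i)) {
      dst[i] = base[index[i] * scale_in_elements];
      mask &= ~(1u << i); // the mask register is written back as elements complete
    }
    // else: dst[i] is left unchanged
  }
}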
"vpscatterqd", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; // prefetch multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index bf515a8..0bdabdf 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -282,6 +282,8 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN", SDTFPBinOpRound>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -304,8 +306,6 @@ def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; -def X86mgather : SDNode<"X86ISD::GATHER", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -526,58 +526,6 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; -def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v8i32 || - // Mgt->getBasePtr().getValueType() == MVT::v8i32); - //return false; - return N != 0; -}]>; - -def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v8i64 || - // Mgt->getBasePtr().getValueType() == MVT::v8i64); - //return false; - return N != 0; -}]>; -def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - // return (Mgt->getIndex().getValueType() == MVT::v16i32 || - // Mgt->getBasePtr().getValueType() == MVT::v16i32); - //return false; - return N != 0; -}]>; - -def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v8i32 || - // Sc->getBasePtr().getValueType() == MVT::v8i32); - //return false; - return N != 0; -}]>; - -def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v8i64 || - // Sc->getBasePtr().getValueType() == MVT::v8i64); - //return false; - return N != 0; -}]>; -def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - // return (Sc->getIndex().getValueType() == MVT::v16i32 || - // Sc->getBasePtr().getValueType() == MVT::v16i32); - //return false; - 
return N != 0; -}]>; - // 128-bit bitconvert pattern fragments def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; @@ -681,3 +629,55 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, return X86::isVINSERT256Index(N); }], INSERT_get_vinsert256_imm>; +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedLoadSDNode>(N)) + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16; + return false; +}]>; + +def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedLoadSDNode>(N)) + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32; + return false; +}]>; + +def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedLoadSDNode>(N)) + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64; + return false; +}]>; + +def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return (dyn_cast<MaskedLoadSDNode>(N) != 0); +}]>; + +def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedStoreSDNode>(N)) + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16; + return false; +}]>; + +def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedStoreSDNode>(N)) + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32; + return false; +}]>; + +def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (dyn_cast<MaskedStoreSDNode>(N)) + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64; + return false; +}]>; + +def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return (dyn_cast<MaskedStoreSDNode>(N) != 0); +}]>; + diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f5b9680..538ec1c 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -104,7 +104,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo( (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), - Subtarget(STI), RI(STI) { + Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -4573,9 +4573,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. 
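The masked_load_aligned128/256/512 and masked_store_aligned* fragments above accept a masked access only when its recorded alignment reaches the full vector width (16, 32 or 64 bytes). A minimal stand-alone sketch of that test follows; the helper name and the plain-pointer interface are illustrative, while the real predicates query MaskedLoadSDNode/MaskedStoreSDNode alignment.

// "Aligned" masked forms are legal only when the access alignment is at
// least the vector width in bytes; otherwise the unaligned form is used.
#include <cstdint>

bool isAlignedForVectorWidth(const void *ptr, unsigned vectorWidthBits) {
  const unsigned widthBytes = vectorWidthBits / 8; // 16, 32 or 64
  return reinterpret_cast<uintptr_t>(ptr) % widthBytes == 0;
}

// Example use: pick the aligned 512-bit masked form only when this holds.
// bool useAlignedForm = isAlignedForVectorWidth(addr, 512);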
- SmallVector<unsigned, 8> Ops; - Ops.push_back(SrcOperandId); - MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); + MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); if (FoldMI) { FoldAsLoadDefReg = 0; return FoldMI; @@ -4670,7 +4668,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. @@ -4697,9 +4695,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, return MIB; } -static MachineInstr *FuseInst(MachineFunction &MF, - unsigned Opcode, unsigned OpNo, - const SmallVectorImpl<MachineOperand> &MOs, +static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, + unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), @@ -4723,7 +4720,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, } static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, MachineInstr *MI) { MachineFunction &MF = *MI->getParent()->getParent(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode)); @@ -4736,12 +4733,12 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } -MachineInstr* -X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, unsigned OpNum, - const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Align, - bool AllowCommute) const { +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + unsigned Size, unsigned Align, + bool AllowCommute) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); @@ -5104,10 +5101,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, MI->addRegisterKilled(Reg, TRI, true); } -MachineInstr* -X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const { // Check switch flag if (NoFusing) return nullptr; @@ -5145,10 +5142,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, } else if (Ops.size() != 1) return nullptr; - SmallVector<MachineOperand,4> MOs; - MOs.push_back(MachineOperand::CreateFI(FrameIndex)); - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, - Size, Alignment, /*AllowCommute=*/true); + return foldMemoryOperandImpl(MF, MI, Ops[0], + MachineOperand::CreateFI(FrameIndex), Size, + Alignment, /*AllowCommute=*/true); } static bool isPartialRegisterLoad(const MachineInstr &LoadMI, @@ -5170,9 +5166,9 @@ static bool isPartialRegisterLoad(const MachineInstr &LoadMI, return false; } -MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + ArrayRef<unsigned> Ops, 
MachineInstr *LoadMI) const { // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI->getDesc().getNumOperands(); @@ -5295,8 +5291,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return nullptr; // Folding a normal load. Just copy the load's address operands. - for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) - MOs.push_back(LoadMI->getOperand(i)); + MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands, + LoadMI->operands_begin() + NumOps); break; } } @@ -5304,9 +5300,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, /*Size=*/0, Alignment, /*AllowCommute=*/true); } - bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { + ArrayRef<unsigned> Ops) const { // Check switch flag if (NoFusing) return 0; @@ -5559,7 +5554,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, } if (Load) BeforeOps.push_back(SDValue(Load, 0)); - std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); + BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end()); SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); NewNodes.push_back(NewNode); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 4d15467..0dd8101 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -305,23 +305,21 @@ public: /// folding and return true, otherwise it should return false. If it folds /// the instruction, it is likely that the MachineInstruction the iterator /// references has been changed. - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, int FrameIndex) const override; /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const override; /// canFoldMemoryOperand - Returns true if the specified load / store is /// folding is possible. - bool canFoldMemoryOperand(const MachineInstr*, - const SmallVectorImpl<unsigned> &) const override; + bool canFoldMemoryOperand(const MachineInstr *, + ArrayRef<unsigned>) const override; /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. 
If this is @@ -406,10 +404,9 @@ public: void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned OpNum, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, unsigned Size, unsigned Alignment, bool AllowCommute) const; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 9881caf..e9a0431 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -572,10 +572,13 @@ def X86GR32orGR64AsmOperand : AsmOperandClass { def GR32orGR64 : RegisterOperand<GR32> { let ParserMatchClass = X86GR32orGR64AsmOperand; } - +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} def AVX512RC : Operand<i32> { let PrintMethod = "printRoundingControl"; let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; } // Sign-extended immediate classes. We don't need to define the full lattice @@ -713,9 +716,6 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; -def vectoraddr : ComplexPattern<iPTR, 5, "SelectAddr", [],[SDNPWantParent]>; -//def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; - //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. def HasCMov : Predicate<"Subtarget->hasCMov()">; @@ -855,11 +855,11 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ return (Imm == X86::COND_E) || (Imm == X86::COND_NE); }]>; -let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. - def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; - def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; - def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; -} + +def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index d2929d2..ccdbf0e 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3567,7 +3567,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, f32mem, ssmem, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, - itins, HasAVX, "SS">, XS, VEX_4V, VEX_LIG; + itins, UseAVX, "SS">, XS, VEX_4V, VEX_LIG; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3579,7 +3579,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, f64mem, sdmem, sse_load_f64, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, itins, HasAVX, "SD">, XD, VEX_4V, VEX_LIG; + OpNode, itins, UseAVX, "SD">, XD, VEX_4V, VEX_LIG; } // Square root. 
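The i16immSExt8/i32immSExt8/i64immSExt8 leaves above keep the predicate Imm == (int8_t)Imm while dropping the FastIselShouldIgnore wrapper. Stated on its own, the check is simply a round trip through int8_t, as in this small stand-alone sketch:

// An immediate can use the sign-extended 8-bit encoding exactly when
// truncating it to int8_t and sign-extending back reproduces the value.
#include <cassert>
#include <cstdint>

bool fitsInSExt8(int64_t imm) {
  return imm == static_cast<int8_t>(imm);
}

int main() {
  assert(fitsInSExt8(127));   // 0x7f  -> imm8 form is usable
  assert(fitsInSExt8(-128));  // 0x80  -> imm8 form is usable
  assert(!fitsInSExt8(128));  // needs the full-width immediate
  assert(!fitsInSExt8(-129));
  return 0;
}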
@@ -4077,7 +4077,7 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4123,7 +4123,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { } } // Predicates = [HasAVX] -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -5902,7 +5902,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; // On AVX2, we also support 256bit inputs. - // FIXME: remove these patterns when the old shuffle lowering goes away. def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), @@ -6955,6 +6954,34 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + itins.rr>, Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -6963,26 +6990,24 @@ let Predicates = [HasAVX] in { } let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; + defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - 
int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; + defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; } - defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V; + defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, @@ -7004,9 +7029,9 @@ let Predicates = [HasAVX2] in { VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; } - defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; + defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -7016,16 +7041,16 @@ let Constraints = "$src1 = $dst" in { 1, SSE_MPSADBW_ITINS>; } let ExeDomain = SSEPackedSingle in - defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv4f32, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; + defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; let ExeDomain = SSEPackedDouble in - defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_BLEND_P>; + defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, + 1, SSE_INTALU_ITINS_BLEND_P>; let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -7116,32 +7141,12 @@ let Predicates = [HasAVX] in { def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), (v4f64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - - def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2), - (imm:$mask))), - (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; - def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), - (imm:$mask))), - (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; - - def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), - (imm:$mask))), - (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), - (imm:$mask))), - (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), - (imm:$mask))), - (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; } let Predicates = [HasAVX2] in { def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), (v32i8 VR256:$src2))), (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), - (imm:$mask))), - 
(VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } // Patterns @@ -7260,17 +7265,6 @@ let Predicates = [UseSSE41] in { def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), (v2f64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - - def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), - (imm:$mask))), - (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), - (imm:$mask))), - (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), - (imm:$mask))), - (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; - } let SchedRW = [WriteLoad] in { @@ -7840,9 +7834,9 @@ def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, WriteFShuffle256>, VEX_L; let Predicates = [HasAVX2] in -def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, - int_x86_avx2_vbroadcasti128, WriteLoad>, - VEX_L; +def VBROADCASTI128 : avx_broadcast_no_int<0x5A, "vbroadcasti128", VR256, + i128mem, v4i64, loadv2i64, + WriteLoad>, VEX_L; let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), @@ -8238,38 +8232,31 @@ let Predicates = [HasF16C] in { // AVX2 Instructions //===----------------------------------------------------------------------===// -/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate -multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { +/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, - VR128, loadv2i64, i128mem>; -defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, loadv4i64, i256mem>, VEX_L; - -def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), - imm:$mask)), - (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; -def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), - imm:$mask)), - (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; +defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem>, VEX_L; //===----------------------------------------------------------------------===// // VPBROADCAST - Load from memory and broadcast to all elements of the @@ -8608,9 +8595,7 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), // def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 
(ins VR256:$src1, u8imm:$src2), - "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteShuffle256]>, VEX, VEX_L; let hasSideEffects = 0, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index e436811..42256b2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -175,8 +175,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0), - X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 6af59d4..cd3076d 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -77,8 +77,8 @@ namespace llvm { X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { MF = &F; CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *MF->getSubtarget().getInstrInfo(), *MF->getSubtarget().getRegisterInfo(), - MF->getSubtarget(), MF->getContext())); + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo(), MF->getContext())); } void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index cab7ce8..06545bc 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -53,26 +54,26 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) - : X86GenRegisterInfo( - (STI.is64Bit() ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), - (STI.is64Bit() ? X86::RIP : X86::EIP)), - Subtarget(STI) { +X86RegisterInfo::X86RegisterInfo(const Triple &TT) + : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), + (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. - Is64Bit = Subtarget.is64Bit(); - IsWin64 = Subtarget.isTargetWin64(); + Is64Bit = TT.isArch64Bit(); + IsWin64 = Is64Bit && TT.isOSWindows(); // Use a callee-saved register as the base pointer. These registers must // not conflict with any ABI requirements. For example, in 32-bit mode PIC // requires GOT in the EBX register before function calls via PLT GOT pointer. 
if (Is64Bit) { SlotSize = 8; - bool Use64BitReg = - Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + // This matches the simplified 32-bit pointer code in the data layout + // computation. + // FIXME: Should use the data layout? + bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; StackPtr = Use64BitReg ? X86::RSP : X86::ESP; FramePtr = Use64BitReg ? X86::RBP : X86::EBP; BasePtr = Use64BitReg ? X86::RBX : X86::EBX; @@ -120,8 +121,9 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } -const TargetRegisterClass* -X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ +const TargetRegisterClass * +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { // Don't allow super-classes of GR8_NOREX. This class is only used after // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the @@ -161,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -172,9 +175,9 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; case 2: // Available for tailcall (not callee-saved GPRs). - if (Subtarget.isTargetWin64()) + if (IsWin64) return &X86::GR64_TCW64RegClass; - else if (Subtarget.is64Bit()) + else if (Is64Bit) return &X86::GR64_TCRegClass; const Function *F = MF.getFunction(); @@ -210,7 +213,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return Subtarget.is64Bit() ? 10 : 4; + return Is64Bit ? 
10 : 4; case X86::VR64RegClassID: return 4; } @@ -218,8 +221,10 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); + bool CallsEHReturn = MF->getMMI().callsEHReturn(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { @@ -253,11 +258,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; + case CallingConv::X86_64_Win64: + return CSR_Win64_SaveList; + case CallingConv::X86_64_SysV: + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; default: break; } - bool CallsEHReturn = MF->getMMI().callsEHReturn(); if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -270,8 +280,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } -const uint32_t* -X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +const uint32_t * +X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -308,6 +320,10 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { break; default: break; + case CallingConv::X86_64_Win64: + return CSR_Win64_RegMask; + case CallingConv::X86_64_SysV: + return CSR_64_RegMask; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -349,7 +365,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { CallingConv::ID CC = MF.getFunction()->getCallingConv(); - const uint32_t* RegMask = getCallPreservedMask(CC); + const uint32_t *RegMask = getCallPreservedMask(MF, CC); if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) report_fatal_error( "Stack realignment in presence of dynamic allocas is not supported with" @@ -393,7 +409,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } } - if (!Is64Bit || !Subtarget.hasAVX512()) { + if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); @@ -486,6 +502,24 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // FRAME_ALLOC uses a single offset, with no register. It only works in the + // simple FP case, and doesn't work with stack realignment. On 32-bit, the + // offset is from the traditional base pointer location. On 64-bit, the + // offset is from the SP at the end of the prologue, not the FP location. This + // matches the behavior of llvm.frameaddress. 
+ if (Opc == TargetOpcode::FRAME_ALLOC) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int Offset; + if (IsWinEH) + Offset = static_cast<const X86FrameLowering *>(TFI) + ->getFrameIndexOffsetFromSP(MF, FrameIndex); + else + Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + FI.ChangeToImmediate(Offset); + return; + } + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. @@ -537,8 +571,9 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return TFI->hasFP(MF) ? FramePtr : StackPtr; } -unsigned X86RegisterInfo::getPtrSizedFrameRegister( - const MachineFunction &MF) const { +unsigned +X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 406b1fc..74edab9 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -20,14 +20,7 @@ #include "X86GenRegisterInfo.inc" namespace llvm { - class Type; - class TargetInstrInfo; - class X86Subtarget; - class X86RegisterInfo final : public X86GenRegisterInfo { -public: - const X86Subtarget &Subtarget; - private: /// Is64Bit - Is the target 64-bits. /// @@ -55,7 +48,7 @@ private: unsigned BasePtr; public: - X86RegisterInfo(const X86Subtarget &STI); + X86RegisterInfo(const Triple &TT); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -76,8 +69,9 @@ public: getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const override; - const TargetRegisterClass* - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. @@ -98,7 +92,8 @@ public: /// callee-save registers on this target. const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 61c0600..677e824 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -2014,7 +2014,7 @@ def : InstRW<[WriteFMADDr], // 3p forms. "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r", + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", // 4s/4s_int forms. "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", // 4p forms. @@ -2031,7 +2031,7 @@ def : InstRW<[WriteFMADDm], // 3p forms. "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)m", + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", // 4s/4s_int forms. "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", // 4p forms. 
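The X86SchedHaswell.td hunks above replace "(r132|231|213)" with "(r132|r231|r213)". In the old alternation the 231/213 alternatives lack the leading 'r', so FMA names such as VFMADDSSr213r were never matched. A quick check with std::regex is shown below; this is illustrative only, since InstRW uses LLVM's own Regex class, but the alternation behaves the same way.

// Shows why the old scheduler regex missed the r231/r213 scalar FMA forms.
#include <cassert>
#include <regex>
#include <string>

int main() {
  const std::string name = "VFMADDSSr213r";
  std::regex oldRe("VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r");
  std::regex newRe("VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r");
  assert(!std::regex_match(name, oldRe)); // old pattern: no alternative starts with 'r' after "SS"
  assert(std::regex_match(name, newRe));  // fixed pattern matches r132/r231/r213 forms
  return 0;
}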
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 7feabf6..ca8fc9c 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -62,8 +62,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, #ifndef NDEBUG // If the base register might conflict with our physical registers, bail out. - unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, - X86::ECX, X86::EAX, X86::EDI}; + const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; assert(!isBaseRegConflictPossible(DAG, ClobberSet)); #endif @@ -228,8 +228,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); // If the base register might conflict with our physical registers, bail out. - unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, - X86::ECX, X86::ESI, X86::EDI}; + const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 4bde053..43d3895 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -37,10 +37,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<TargetLoweringObjectFileMachO>(); } - if (TT.isOSLinux()) - return make_unique<X86LinuxTargetObjectFile>(); + if (TT.isOSLinux() || TT.isOSNaCl()) + return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) - return make_unique<TargetLoweringObjectFileELF>(); + return make_unique<X86ELFTargetObjectFile>(); if (TT.isKnownWindowsMSVCEnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) @@ -94,9 +94,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(Triple(TT)), TT, CPU, FS, Options, + RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), - DL(computeDataLayout(Triple(TT))), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // default to hard float ABI if (Options.FloatABIType == FloatABI::Default) diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 283858d..c9833ed 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -24,8 +24,6 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - // Calculates type size & alignment - const DataLayout DL; X86Subtarget Subtarget; mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; @@ -35,8 +33,6 @@ public: const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~X86TargetMachine() override; - const DataLayout *getDataLayout() const override { return &DL; } - const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86Subtarget *getSubtargetImpl(const Function &F) const override; TargetIRAnalysis getTargetIRAnalysis() override; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 1d1c32e..d65d3b0 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -15,17 +15,13 @@ #include 
"llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" using namespace llvm; using namespace dwarf; -X86_64MachoTargetObjectFile::X86_64MachoTargetObjectFile() - : TargetLoweringObjectFileMachO() { - SupportIndirectSymViaGOTPCRel = true; -} - const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI, @@ -52,28 +48,30 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, int64_t Offset) const { + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry // from a data section. In case there's an additional offset, then use // foo@GOTPCREL+4+<offset>. + unsigned FinalOff = Offset+MV.getConstant()+4; const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); - const MCExpr *Off = MCConstantExpr::Create(Offset+4, getContext()); + const MCExpr *Off = MCConstantExpr::Create(FinalOff, getContext()); return MCBinaryExpr::CreateAdd(Res, Off, getContext()); } +const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} + void -X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { +X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } -const MCExpr * -X86LinuxTargetObjectFile::getDebugThreadLocalSymbol( - const MCSymbol *Sym) const { - return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); -} - const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const { // We are looking for the difference of two symbols, need a subtraction @@ -97,14 +95,12 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( SubRHS->getPointerAddressSpace() != 0) return nullptr; - // Both ptrtoint instructions must wrap global variables: + // Both ptrtoint instructions must wrap global objects: // - Only global variables are eligible for image relative relocations. - // - The subtrahend refers to the special symbol __ImageBase, a global. - const GlobalVariable *GVLHS = - dyn_cast<GlobalVariable>(SubLHS->getPointerOperand()); - const GlobalVariable *GVRHS = - dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); - if (!GVLHS || !GVRHS) + // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable. + const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand()); + const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); + if (!GOLHS || !GVRHS) return nullptr; // We expect __ImageBase to be a global variable without a section, externally @@ -117,10 +113,10 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( return nullptr; // An image-relative, thread-local, symbol makes no sense. 
- if (GVLHS->isThreadLocal()) + if (GOLHS->isThreadLocal()) return nullptr; - return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang), + return MCSymbolRefExpr::Create(TM.getSymbol(GOLHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); } diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f745538..2e25fb2 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -19,8 +19,6 @@ namespace llvm { /// x86-64. class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: - X86_64MachoTargetObjectFile(); - const MCExpr * getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, @@ -33,20 +31,25 @@ namespace llvm { const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr * - getIndirectSymViaGOTPCRel(const MCSymbol *Sym, - int64_t Offset) const override; + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; }; - /// X86LinuxTargetObjectFile - This implementation is used for linux x86 - /// and x86-64. - class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - + /// \brief This implemenatation is used for X86 ELF targets that don't + /// have a further specialization. + class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF { /// \brief Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; + /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and + /// Native Client on x86 and x86-64. + class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// \brief This implementation is used for Windows targets on x86 and x86-64. class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * |