Diffstat (limited to 'lib/Target/X86')
27 files changed, 1204 insertions, 828 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 73a0095..c89e738 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -67,12 +67,19 @@ private: SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); - bool MatchInstruction(SMLoc IDLoc, + bool MatchInstruction(SMLoc IDLoc, unsigned &Kind, SmallVectorImpl<MCParsedAsmOperand*> &Operands, SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo, bool matchingInlineAsm = false); + unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + unsigned OperandNum, unsigned &NumMCOperands) { + return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, + NumMCOperands); + } + /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode. bool isSrcOp(X86Operand &Op); @@ -514,12 +521,13 @@ bool X86AsmParser::isDstOp(X86Operand &Op) { bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { RegNo = 0; - if (!isParsingIntelSyntax()) { - const AsmToken &TokPercent = Parser.getTok(); - assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!"); - StartLoc = TokPercent.getLoc(); + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) Parser.Lex(); // Eat percent token. - } const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) { @@ -1516,9 +1524,12 @@ bool X86AsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { - SmallVector<MCInst, 2> Insts; + unsigned Kind; unsigned ErrorInfo; - bool Error = MatchInstruction(IDLoc, Operands, Insts, ErrorInfo); + SmallVector<MCInst, 2> Insts; + + bool Error = MatchInstruction(IDLoc, Kind, Operands, Insts, + ErrorInfo); if (!Error) for (unsigned i = 0, e = Insts.size(); i != e; ++i) Out.EmitInstruction(Insts[i]); @@ -1526,7 +1537,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, } bool X86AsmParser:: -MatchInstruction(SMLoc IDLoc, +MatchInstruction(SMLoc IDLoc, unsigned &Kind, SmallVectorImpl<MCParsedAsmOperand*> &Operands, SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo, bool matchingInlineAsm) { @@ -1537,7 +1548,7 @@ MatchInstruction(SMLoc IDLoc, // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. - // Also, MatchInstructionImpl should do actually *do* the EmitInstruction + // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || @@ -1568,7 +1579,7 @@ MatchInstruction(SMLoc IDLoc, MCInst Inst; // First, try a direct match. 
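A minimal sketch of the percent rule the ParseRegister hunk above introduces, using a hypothetical stand-alone helper rather than LLVM's actual tokenizer: in AT&T syntax the '%' prefix becomes optional, since CFI directives may name registers bare.

#include <cassert>
#include <string>

// Hypothetical helper, not LLVM's parser: eat a leading '%' if present so
// that both "%rbp" and the bare "rbp" of a .cfi directive resolve the same.
std::string eatOptionalPercent(const std::string &Tok, bool IntelSyntax) {
  if (!IntelSyntax && !Tok.empty() && Tok[0] == '%')
    return Tok.substr(1); // eat the percent token
  return Tok;             // already bare -- the cfi-directive case
}

int main() {
  assert(eatOptionalPercent("%rbp", false) == "rbp");
  assert(eatOptionalPercent("rbp", false) == "rbp");
}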
- switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo, + switch (MatchInstructionImpl(Operands, Kind, Inst, OrigErrorInfo, isParsingIntelSyntax())) { default: break; case Match_Success: @@ -1585,9 +1596,6 @@ MatchInstruction(SMLoc IDLoc, Error(IDLoc, "instruction requires a CPU feature not currently enabled", EmptyRanges, matchingInlineAsm); return true; - case Match_ConversionFail: - return Error(IDLoc, "unable to convert operands to instruction", - EmptyRanges, matchingInlineAsm); case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; @@ -1619,14 +1627,19 @@ MatchInstruction(SMLoc IDLoc, Tmp[Base.size()] = Suffixes[0]; unsigned ErrorInfoIgnore; unsigned Match1, Match2, Match3, Match4; + unsigned tKind; - Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match1 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match1 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[1]; - Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match2 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match2 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[2]; - Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match3 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match3 == Match_Success) Kind = tKind; Tmp[Base.size()] = Suffixes[3]; - Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Match4 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore); + if (Match4 == Match_Success) Kind = tKind; // Restore the old token. Op->setTokenValue(Base); @@ -1677,8 +1690,10 @@ MatchInstruction(SMLoc IDLoc, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { + ArrayRef<SMRange> Ranges = matchingInlineAsm ? EmptyRanges : + Op->getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", - Op->getLocRange(), matchingInlineAsm); + Ranges, matchingInlineAsm); } // Recover location info for the operand if we know which was the problem. @@ -1730,7 +1745,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveWord(2, DirectiveID.getLoc()); else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); - else if (IDVal.startswith(".intel_syntax")) { + else if (IDVal.startswith(".att_syntax")) { + getParser().setAssemblerDialect(0); + return false; + } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { if(Parser.getTok().getString() == "noprefix") { diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5039887..f136927 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -44,7 +44,7 @@ void x86DisassemblerDebug(const char *file, dbgs() << file << ":" << line << ": " << s; } -const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) { +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) { const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); return MII->getName(Opcode); } @@ -95,8 +95,8 @@ const EDInstInfo *X86GenericDisassembler::getEDInfo() const { /// be a pointer to a MemoryObject. /// @param byte - A pointer to the byte to be read. /// @param address - The address to be read. 
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) { - MemoryObject* region = static_cast<MemoryObject*>(arg); +static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { + const MemoryObject* region = static_cast<const MemoryObject*>(arg); return region->readByte(address, byte); } @@ -135,10 +135,10 @@ X86GenericDisassembler::getInstruction(MCInst &instr, int ret = decodeInstruction(&internalInstr, regionReader, - (void*)®ion, + (const void*)®ion, loggerFn, (void*)&vStream, - (void*)MII, + (const void*)MII, address, fMode); @@ -379,6 +379,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } switch (type) { + case TYPE_XMM32: + case TYPE_XMM64: case TYPE_XMM128: mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); return; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index 0c92912..af444d1 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -200,7 +200,7 @@ static void unconsumeByte(struct InternalInstruction* insn) { insn->readerCursor + offset); \ if (ret) \ return ret; \ - combined = combined | ((type)byte << ((type)offset * 8)); \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ } \ *ptr = combined; \ insn->readerCursor += sizeof(type); \ @@ -719,7 +719,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) { * @return - 0 if the ModR/M could be read when needed or was not needed; * nonzero otherwise. */ -static int getID(struct InternalInstruction* insn, void *miiArg) { +static int getID(struct InternalInstruction* insn, const void *miiArg) { uint8_t attrMask; uint16_t instructionID; @@ -1621,10 +1621,10 @@ static int readOperands(struct InternalInstruction* insn) { */ int decodeInstruction(struct InternalInstruction* insn, byteReader_t reader, - void* readerArg, + const void* readerArg, dlog_t logger, void* loggerArg, - void* miiArg, + const void* miiArg, uint64_t startLoc, DisassemblerMode mode) { memset(insn, 0, sizeof(struct InternalInstruction)); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 797703f..05cbb4c 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -403,7 +403,7 @@ typedef uint8_t BOOL; * be read from. * @return - -1 if the byte cannot be read for any reason; 0 otherwise. 
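The disassembler changes above thread const through the decoder's opaque reader argument (byteReader_t now takes const void*). A minimal sketch of the resulting callback pattern, with a hypothetical MemoryBlob standing in for llvm::MemoryObject:

#include <cstdint>

// Hypothetical stand-in for the MemoryObject the real code reads from.
struct MemoryBlob {
  const uint8_t *Data;
  uint64_t Size;
  int readByte(uint64_t Addr, uint8_t *Byte) const {
    if (Addr >= Size) return -1;
    *Byte = Data[Addr];
    return 0;
  }
};

// Const-qualified opaque argument: the shim casts back to pointer-to-const,
// so only const member functions are callable -- the point of the change.
typedef int (*byteReader_t)(const void *Arg, uint8_t *Byte, uint64_t Addr);

static int blobReader(const void *Arg, uint8_t *Byte, uint64_t Addr) {
  const MemoryBlob *B = static_cast<const MemoryBlob *>(Arg);
  return B->readByte(Addr, Byte);
}

int main() {
  uint8_t Bytes[] = {0x90};
  MemoryBlob Blob = {Bytes, 1};
  uint8_t Out = 0;
  byteReader_t Reader = blobReader;
  return Reader(&Blob, &Out, 0); // reads 0x90, returns 0
}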
*/ -typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address); +typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address); /* * dlog_t - Type for the logging function that the consumer can provide to @@ -422,7 +422,7 @@ struct InternalInstruction { /* Reader interface (C) */ byteReader_t reader; /* Opaque value passed to the reader */ - void* readerArg; + const void* readerArg; /* The address of the next byte to read via the reader */ uint64_t readerCursor; @@ -561,10 +561,10 @@ struct InternalInstruction { */ int decodeInstruction(struct InternalInstruction* insn, byteReader_t reader, - void* readerArg, + const void* readerArg, dlog_t logger, void* loggerArg, - void* miiArg, + const void* miiArg, uint64_t startLoc, DisassemblerMode mode); @@ -579,7 +579,7 @@ void x86DisassemblerDebug(const char *file, unsigned line, const char *s); -const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii); +const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii); #ifdef __cplusplus } diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 624e56f..4011035 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -941,3 +941,15 @@ and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the cost of reduced accuracy. //===---------------------------------------------------------------------===// + +This function should be matched to haddpd when the appropriate CPU is enabled: + +#include <x86intrin.h> +double f (__m128d p) { + return p[0] + p[1]; +} + +similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should +turn into hsubpd also. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 18e6b7c..d078a7b 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -120,6 +120,9 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", + "HasSlowDivide", "true", + "Use small divide for positive values less than 256">; //===----------------------------------------------------------------------===// // X86 processors supported. 
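FeatureSlowDivide above is what later gates the addBypassSlowDivType call in X86ISelLowering.cpp: on Atom, a 32-bit divide whose operands both fit in a byte can be rerouted to the much cheaper 8-bit divide. A hedged scalar model of that guarded fast path (illustrative only, unsigned case):

#include <cassert>
#include <cstdint>

uint32_t div32(uint32_t A, uint32_t B) {
  if (A < 256 && B < 256)             // both values fit in a byte
    return uint8_t(A) / uint8_t(B);   // would map to the 8-bit DIV (divb)
  return A / B;                       // otherwise the full 32-bit DIV
}

int main() {
  assert(div32(250, 7) == 35);    // fast path
  assert(div32(1000, 7) == 142);  // slow path, same answer
}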
@@ -160,7 +163,8 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B, - FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>; + FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, + FeatureSlowDivide]>; // "Arrandale" along with corei3 and corei5 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureFastUAMem, diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index db71e27..a4785c9 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -233,12 +233,14 @@ void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { + raw_ostream &O, const char *Modifier, + unsigned AsmVariant) { const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { default: llvm_unreachable("unknown operand type!"); case MachineOperand::MO_Register: { - O << '%'; + // FIXME: Enumerating AsmVariant, so we can remove magic number. + if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { EVT VT = (strcmp(Modifier+6,"64") == 0) ? @@ -471,7 +473,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant); return false; } diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 35386cd..0062387 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -50,7 +50,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // These methods are used by the tablegen'erated instruction printer. void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = 0, unsigned AsmVariant = 0); void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index d705049..e202321 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Function.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCExpr.h" @@ -134,8 +133,7 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { IsPIC = TM.getRelocationModel() == Reloc::PIC_; do { - DEBUG(dbgs() << "JITTing function '" - << MF.getFunction()->getName() << "'\n"); + DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n"); MCE.startFunction(MF); for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index e5952aa..54704d8 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2014,13 +2014,17 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { MVT VT; if (!isTypeLegal(C->getType(), VT)) - return false; + return 0; + + // Can't handle alternate code models yet. 
+ if (TM.getCodeModel() != CodeModel::Small) + return 0; // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = NULL; switch (VT.SimpleTy) { - default: return false; + default: return 0; case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; @@ -2058,7 +2062,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { break; case MVT::f80: // No f80 support yet. - return false; + return 0; } // Materialize addresses with LEA instructions. diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 955c75a..9d5de81 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -171,6 +171,7 @@ namespace { // Shuffle live registers to match the expectations of successor blocks. void finishBlockStack(); +#ifndef NDEBUG void dumpStack() const { dbgs() << "Stack contents:"; for (unsigned i = 0; i != StackTop; ++i) { @@ -181,6 +182,7 @@ namespace { dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]); dbgs() << "\n"; } +#endif /// getSlot - Return the stack slot number a particular register number is /// in. diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 27195b4..5fdc61e 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -100,6 +100,7 @@ namespace { Base_Reg = Reg; } +#ifndef NDEBUG void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; @@ -133,6 +134,7 @@ namespace { dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Align << '\n'; } +#endif }; } @@ -1011,7 +1013,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.IndexReg = ShVal.getNode()->getOperand(0); ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); - uint64_t Disp = AddVal->getSExtValue() << Val; + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!FoldOffsetIntoAddress(Disp, AM)) return false; } @@ -2116,7 +2118,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Make sure that we don't change the operation by removing bits. // This only matters for OR and XOR, AND is unaffected. 
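The Select change just below replaces the shift-round-trip test with an explicit mask of the bits a left shift by ShlVal would drop. A quick exhaustive check that the two forms agree (assuming ShlVal < 64):

#include <cassert>
#include <cstdint>

bool dropsBitsOld(uint64_t Val, unsigned ShlVal) {
  return ((Val >> ShlVal) << ShlVal) != Val;
}

bool dropsBitsNew(uint64_t Val, unsigned ShlVal) {
  uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
  return (Val & RemovedBitsMask) != 0;
}

int main() {
  for (unsigned S = 0; S < 8; ++S)
    for (uint64_t V = 0; V < 512; ++V)
      assert(dropsBitsOld(V, S) == dropsBitsNew(V, S));
}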
- if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) break; unsigned ShlOp, Op; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7954170..5c525ae 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -182,6 +182,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); + // Bypass i32 with i8 on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext())); + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); @@ -735,6 +739,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); @@ -824,6 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); @@ -857,6 +863,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); @@ -925,6 +932,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } if (Subtarget->hasSSE41()) { @@ -939,6 +948,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, 
MVT::f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1016,19 +1028,25 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1052,7 +1070,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); - if (Subtarget->hasFMA()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Custom); setOperationAction(ISD::FMA, MVT::v4f64, Custom); setOperationAction(ISD::FMA, MVT::v4f32, Custom); @@ -2832,7 +2850,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); + ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3506,25 +3524,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } - static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; - static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; - const int *CompactionMask; - if (MatchEvenMask) - CompactionMask = CompactionMaskEven; - else if (MatchOddMask) - CompactionMask = CompactionMaskOdd; - else + if (!MatchEvenMask && !MatchOddMask) return SDValue(); - + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), - UndefNode, CompactionMask); - SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), - UndefNode, CompactionMask); - static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); + SDValue Op0 = SVOp->getOperand(0); + SDValue Op1 = SVOp->getOperand(1); + + if (MatchEvenMask) { + // Shift the second operand right to 32 bits. + static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; + Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); + } else { + // Shift the first operand left to 32 bits. 
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; + Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); + } + static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand @@ -4977,6 +4996,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); @@ -5881,8 +5912,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. @@ -5906,7 +5935,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (V2IsUndef) + + // As PSHUFB will zero elements with negative indices, it's safe to ignore + // the 2nd operand if it's undefined or zero. + if (V2.getOpcode() == ISD::UNDEF || + ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5992,6 +6025,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } +// v32i8 shuffles - Translate to VPSHUFB if possible. +static +SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG, + const X86TargetLowering &TLI) { + EVT VT = SVOp->getValueType(0); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); + + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // VPSHUFB may be generated if + // (1) one of input vector is undefined or zeroinitializer. + // The mask value 0x80 puts 0 in the corresponding slot of the vector. + // And (2) the mask indexes don't cross the 128-bit lane. + if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() || + (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) + return SDValue(); + + if (V1IsAllZero && !V2IsAllZero) { + CommuteVectorShuffleMask(MaskVals, 32); + V1 = V2; + } + SmallVector<SDValue, 32> pshufbMask; + for (unsigned i = 0; i != 32; i++) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || EltIdx >= 32) + EltIdx = 0x80; + else { + if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) + // Cross lane is not allowed. 
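The per-byte semantics behind the two conditions above, modeled in plain C++ (a hedged stand-in for the AVX2 VPSHUFB instruction, not LLVM code): a mask byte with bit 7 set writes zero, and indexing never leaves the source's own 128-bit lane.

#include <cstdint>

// Scalar model of VPSHUFB on 32 bytes: two independent 16-byte lanes.
void vpshufb32(const uint8_t Src[32], const uint8_t Mask[32],
               uint8_t Dst[32]) {
  for (int i = 0; i < 32; ++i) {
    int LaneBase = (i < 16) ? 0 : 16;  // a lane can't read the other lane
    Dst[i] = (Mask[i] & 0x80) ? 0      // 0x80 puts 0 in this slot
                              : Src[LaneBase + (Mask[i] & 0x0f)];
  }
}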
+ return SDValue(); + EltIdx &= 0xf; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v32i8, &pshufbMask[0], 32)); +} + /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in @@ -6818,6 +6896,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } + if (VT == MVT::v32i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. if (NumElems == 4 && VT.is128BitVector()) @@ -8115,26 +8199,35 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::LowerFABS(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - Constant *C; - if (EltVT == MVT::f64) { - C = ConstantVector::getSplat(2, - ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - } else { - C = ConstantVector::getSplat(4, - ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); + NumElts = VT.getVectorNumElements(); } - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantVector::getSplat(NumElts, C); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); + if (VT.isVector()) { + MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::AND, dl, ANDVT, + DAG.getNode(ISD::BITCAST, dl, ANDVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); + } return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8154,10 +8247,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { else C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); if (VT.isVector()) { MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, @@ -9943,62 +10037,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const Op.getOperand(1), Op.getOperand(2), DAG); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. 
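Stepping back to the LowerFABS rewrite above: it splats ~(1ULL << 63) (or ~(1U << 31) for f32) and ANDs it into the value, and clearing the sign bit is exactly fabs. A scalar sanity check of the f64 bit pattern:

#include <cassert>
#include <cstdint>
#include <cstring>

double fabsViaMask(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof Bits);
  Bits &= ~(1ULL << 63);            // the same mask the lowering splats
  std::memcpy(&X, &Bits, sizeof X);
  return X;
}

int main() {
  assert(fabsViaMask(-2.5) == 2.5);
  assert(fabsViaMask(2.5) == 2.5);
}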
- case Intrinsic::x86_mmx_pslli_w: - case Intrinsic::x86_mmx_pslli_d: - case Intrinsic::x86_mmx_pslli_q: - case Intrinsic::x86_mmx_psrli_w: - case Intrinsic::x86_mmx_psrli_d: - case Intrinsic::x86_mmx_psrli_q: - case Intrinsic::x86_mmx_psrai_w: - case Intrinsic::x86_mmx_psrai_d: { - SDValue ShAmt = Op.getOperand(2); - if (isa<ConstantSDNode>(ShAmt)) - return SDValue(); - - unsigned NewIntNo; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - } - - // The vector shift intrinsics with scalars uses 32b shift amounts but - // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits - // to be zero. - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, - DAG.getConstant(0, MVT::i32)); -// FIXME this must be lowered to get rid of the invalid type. - - EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(NewIntNo, MVT::i32), - Op.getOperand(1), ShAmt); - } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -10077,6 +10115,74 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + Opc = X86ISD::FMADD; + break; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + Opc = X86ISD::FMSUB; + break; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + Opc = X86ISD::FNMADD; + break; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + Opc = X86ISD::FNMSUB; + break; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + Opc = X86ISD::FMADDSUB; + break; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + Opc = X86ISD::FMSUBADD; + break; + } + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } } } @@ -10918,7 +11024,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); } // fall through case MVT::v4i32: @@ -14020,7 +14126,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // // where Op could be BRCOND or CMOV. // -static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) @@ -14056,40 +14162,133 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { if (SetCC.getOpcode() == ISD::ZERO_EXTEND) SetCC = SetCC.getOperand(0); - // Quit if not SETCC. - // FIXME: So far we only handle the boolean value generated from SETCC. If - // there is other ways to generate boolean values, we need handle them here - // as well. - if (SetCC.getOpcode() != X86ISD::SETCC) + switch (SetCC.getOpcode()) { + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + // A special case for rdrand, where 0 is set if false cond is found. + SDValue Op = SetCC.getOperand(0); + if (Op.getOpcode() != X86ISD::RDRAND) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. 
+ needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } + + return SDValue(); +} + +/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS +/// updated. If only flag result is used and the result is evaluated from a +/// series of element extraction, try to combine it into a PTEST. +static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC, + SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDNode *N = Or.getNode(); + DebugLoc DL = N->getDebugLoc(); + + // Only SSE4.1 and beyond supports PTEST or like. + if (!Subtarget->hasSSE41()) return SDValue(); - // Set the condition code or opposite one if necessary. - CC = X86::CondCode(SetCC.getConstantOperandVal(0)); - if (needOppositeCond) - CC = X86::GetOppositeBranchCondition(CC); + if (N->getOpcode() != X86ISD::OR) + return SDValue(); - return SetCC.getOperand(1); -} + // Quit if the value result of OR is used. + if (N->hasAnyUseOfValue(0)) + return SDValue(); -static bool IsValidFCMOVCondition(X86::CondCode CC) { - switch (CC) { - default: - return false; - case X86::COND_B: - case X86::COND_BE: - case X86::COND_E: - case X86::COND_P: - case X86::COND_AE: - case X86::COND_A: - case X86::COND_NE: - case X86::COND_NP: - return true; + // Quit if not used as a boolean value. + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SmallVector<SDValue, 8> Opnds; + SDValue VecIn; + EVT VT = MVT::Other; + unsigned Mask = 0; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Check if all elements are extracted from the same vector. + SDValue ExtractedFromVec = I->getOperand(0); + if (VecIn.getNode() == 0) { + VT = ExtractedFromVec.getValueType(); + // FIXME: only 128-bit vector is supported so far. + if (!VT.is128BitVector()) + return SDValue(); + VecIn = ExtractedFromVec; + } else if (VecIn != ExtractedFromVec) + return SDValue(); + + // Record the constant index. + Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); } + + assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far."); + + // Quit if not all elements are used. + if (Mask != (1U << VT.getVectorNumElements()) - 1U) + return SDValue(); + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); // If the flag operand isn't dead, don't touch this CMOV. @@ -14114,10 +14313,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(Cond, CC); + Flags = checkBoolTestSetCCCombine(Cond, CC); if (Flags.getNode() && // Extra check as FCMOV only supports a subset of X86 cond. - (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) { + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), + Ops, array_lengthof(Ops)); + } + + Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget); + if (Flags.getNode()) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), @@ -15384,7 +15591,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes - // into FMINC and MMAXC, which are Commutative operations. + // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); @@ -15502,8 +15709,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || + (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); @@ -15525,9 +15737,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode; if (!NegMul) - Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + Opcode = (!NegC) ? 
X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); } @@ -15625,7 +15838,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -15641,7 +15856,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); + } + + Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); @@ -15663,7 +15884,14 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, + Flags); + } + + Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, @@ -15858,7 +16086,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); @@ -15888,7 +16116,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); - case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index b0c27c8..bfe9541 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -16,15 +16,18 @@ // // Return instructions. +// +// The X86retflag return instructions are variadic because we may add ST0 and +// ST1 arguments when returning values on the x87 stack. 
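Context for that comment (illustrative, not part of the patch): under the SysV x86 ABIs an x87 return value travels back in ST0, and making RET variadic lets codegen attach that register use as an extra operand so its liveness is modeled.

#include <cstdio>

// On x86, 'long double' is returned in x87 ST0 (ST1 is also used for
// _Complex long double), which is what the variable_ops above models.
long double half(long double X) { return X / 2.0L; } // result left in ST0

int main() { std::printf("%.1Lf\n", half(3.0L)); }   // prints 1.5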
let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, FPForm = SpecialFP in { - def RET : I <0xC3, RawFrm, (outs), (ins), + def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)], IIC_RET>; def RETW : I <0xC3, RawFrm, (outs), (ins), "ret{w}", [], IIC_RET>, OpSize; - def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), + def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), "ret\t$amt", [(X86retflag timm:$amt)], IIC_RET_IMM>; def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 95ee7e5..5663800 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -19,7 +19,8 @@ let Constraints = "$src1 = $dst" in { multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - SDPatternOperator Op = null_frag, bit MayLoad = 1> { + SDPatternOperator Op = null_frag> { + let isCommutable = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -27,7 +28,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -35,6 +36,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; + let isCommutable = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -42,7 +44,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -59,7 +61,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, SDNode Op, ValueType OpTy128, ValueType OpTy256> { defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy)), - MemFrag128, MemFrag256, OpTy128, OpTy256, Op, 0>; + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; let neverHasSideEffects = 1 in { defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy)), @@ -112,148 +114,18 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Predicates = [HasFMA] in { - def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMADDSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBADDPSr213r 
VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFMSUBADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMADDSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFMSUBADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMADDSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMADDSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFMSUBADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFMSUBADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMADDSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMADDSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFMSUBADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : 
Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFMSUBADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFNMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, - (memopv4f32 addr:$src3)), - (VFNMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFNMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, - (memopv8f32 addr:$src3)), - (VFNMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFNMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, VR128:$src3), - (VFNMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, - (memopv2f64 addr:$src3)), - (VFNMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>; - - def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFNMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3), - (VFNMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, - (memopv4f64 addr:$src3)), - (VFNMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>; - -} // Predicates = [HasFMA] - let Constraints = "$src1 = $dst" in { multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - SDPatternOperator OpNode = null_frag, bit MayLoad = 1> { + SDPatternOperator OpNode = null_frag> { + let isCommutable = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; - let mayLoad = MayLoad in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, @@ -266,6 +138,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop, ComplexPattern mem_cpat, Intrinsic IntId, RegisterClass RC> { + let isCommutable = 1 in def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -294,7 +167,7 @@ let 
neverHasSideEffects = 1 in { } defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), - x86memop, RC, OpVT, mem_frag, OpNode, 0>, + x86memop, RC, OpVT, mem_frag, OpNode>, fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), memop, mem_cpat, Int, RC>; } @@ -324,73 +197,102 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, //===----------------------------------------------------------------------===// -multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic Int> { - def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; - def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, memop:$src3), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4; - def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, memop:$src2, VR128:$src3), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; // For disassembler let isCodeGenOnly = 1 in - def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), + def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; } -multiclass fma4p<bits<8> opc, string OpcodeStr, - Intrinsic Int128, Intrinsic Int256, +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { + let isCommutable = 1 in + def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, + mem_cpat:$src3))]>, VEX_W, MemOp4; + def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag 
ld_frag256> { + let isCommutable = 1 in def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, MemOp4; def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + let isCommutable = 1 in def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4; + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, MemOp4; def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4; def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; + (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; // For disassembler let isCodeGenOnly = 1 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), @@ -406,45 +308,58 @@ let isCodeGenOnly = 1 in { let Predicates = [HasFMA4] in { -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps, - int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>; -defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd, - int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps, - int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd, - int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps, - int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>; -defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd, - int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", 
ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; -defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps, - int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>; -defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd, - int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>; -defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps, - int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>; -defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd, - int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps, - int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd, - int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>; +defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; +defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; +defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; +defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; +defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; +defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; +defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; +defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + +defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + memopv2f64, memopv4f64>; +defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + memopv4f32, memopv8f32>; +defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + memopv2f64, memopv4f64>; } // HasFMA4 diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 81b4f81..55ad2ec 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ 
b/lib/Target/X86/X86InstrFormats.td @@ -287,12 +287,14 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +def __xs : XS; + // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -303,7 +305,7 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -314,18 +316,25 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : I<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); } +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]); +} + // PIi8 - SSE 1 & 2 packed instructions with immediate class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : Ii8<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); @@ -341,18 +350,18 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = 
IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, @@ -372,27 +381,31 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. // VSDI - SSE2 instructions with XD prefix in AVX form. // VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. +// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as +// MMX operands. +// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as +// MMX operands. class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>; class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, @@ -405,6 +418,12 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; +class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -415,21 +434,23 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, 
ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, - Requires<[HasSSE3]>; + Requires<[UseSSE3]>; // SSSE3 Instruction Templates: // // SS38I - SSSE3 instructions with T8 prefix. // SS3AI - SSSE3 instructions with TA prefix. +// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. +// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands. // // Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version // uses the MMX registers. The 64-bit versions are grouped with the MMX @@ -438,10 +459,18 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSSE3]>; + Requires<[UseSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + Requires<[UseSSSE3]>; +class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, + Requires<[HasSSSE3]>; +class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: @@ -452,11 +481,11 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; // SSE4.2 Instruction Templates: // @@ -464,9 +493,10 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE42]>; + Requires<[UseSSE42]>; // SS42FI - SSE 4.2 instructions with T8XD prefix. +// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. 
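A note on the Has*/Use* predicate split running through these template classes: judging from the predicate definitions added in the X86InstrInfo.td hunk further down, UseSSEn holds only when the subtarget has SSEn and no AVX, so the legacy SSE encodings are selected only when no VEX form could be used instead; SS42FI keeps plain HasSSE42 because CRC32 has no AVX form and must stay selectable on AVX subtargets, and the new MMX* template classes keep Has* for the same reason, MMX operands having no VEX encoding. The def __xs : XS helper likewise just replaces the magic prefix constant 12 with a named field. A minimal C++ sketch of the predicate logic, with a hypothetical Subtarget struct standing in for X86Subtarget:

#include <cstdio>

struct Subtarget { bool SSE42, AVX; };  // hypothetical stand-in for X86Subtarget

static bool HasSSE42(const Subtarget &S) { return S.SSE42; }           // plain feature test
static bool UseSSE42(const Subtarget &S) { return S.SSE42 && !S.AVX; } // feature test plus hasNoAVX()

int main() {
  Subtarget SandyBridge{true, true};  // SSE4.2 and AVX
  Subtarget Nehalem{true, false};     // SSE4.2 only
  // On the AVX machine only VEX forms match (Use is false), while CRC32,
  // guarded by Has, remains selectable on both.
  std::printf("SandyBridge: Has=%d Use=%d\n", HasSSE42(SandyBridge), UseSSE42(SandyBridge)); // 1 0
  std::printf("Nehalem:     Has=%d Use=%d\n", HasSSE42(Nehalem), UseSSE42(Nehalem));         // 1 1
  return 0;
}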
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; @@ -475,7 +505,7 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE42]>; + Requires<[UseSSE42]>; // AVX Instruction Templates: // Instructions introduced in AVX (no SSE equivalent forms) diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index ee2d3c4..9035435 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -183,8 +183,8 @@ def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; -def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>; -def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -240,6 +240,10 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; +// 128-/256-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; + // Like 'store', but always requires 128-bit vector alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 459f01a..4f3d824 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1110,6 +1110,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 }, { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 }, // FIXME: add AVX 256-bit foldable instructions + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_16 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_16 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_16 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_16 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -1237,6 +1267,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_16 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_16 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_16 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_16 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, + { 
X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -1786,10 +1846,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isKill = MI->getOperand(1).isKill(); + const MachineOperand &Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); MachineInstr *NewMI = NULL; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When @@ -1807,11 +1865,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); if (B != C) return 0; - unsigned A = MI->getOperand(0).getReg(); unsigned M = MI->getOperand(3).getImm(); NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addReg(A, RegState::Define | getDeadRegState(isDead)) - .addReg(B, getKillRegState(isKill)).addImm(M); + .addOperand(Dest).addOperand(Src).addImm(M); break; } case X86::SHUFPDrri: { @@ -1821,15 +1877,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); if (B != C) return 0; - unsigned A = MI->getOperand(0).getReg(); unsigned M = MI->getOperand(3).getImm(); // Convert to PSHUFD mask. M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addReg(A, RegState::Define | getDeadRegState(isDead)) - .addReg(B, getKillRegState(isKill)).addImm(M); + .addOperand(Dest).addOperand(Src).addImm(M); break; } case X86::SHL64ri: { @@ -1840,15 +1894,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ShAmt == 0 || ShAmt >= 4) return 0; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR64_NOSPRegClass)) return 0; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)) - .addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } case X86::SHL32ri: { @@ -1859,15 +1912,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ShAmt == 0 || ShAmt >= 4) return 0; // LEA can't handle ESP. 
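Stepping back to the SHUFPDrri case just above: when both sources are the same register, the two-bit f64 shuffle immediate can be widened to a PSHUFD dword immediate, because f64 element k occupies dwords 2k and 2k+1; the constant fields of the formula contribute the 0x44. A standalone check of that bit manipulation, with no LLVM dependencies:

#include <cassert>

// Same formula as the SHUFPDrri -> PSHUFDri conversion above.
static unsigned shufpdToPshufd(unsigned M) {
  return ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6) | 0x44;
}

int main() {
  for (unsigned M = 0; M < 4; ++M) {
    unsigned lo = M & 1, hi = (M >> 1) & 1, P = shufpdToPshufd(M);
    // f64 element k maps to the dword pair {2k, 2k+1}.
    assert(((P >> 0) & 3) == 2 * lo && ((P >> 2) & 3) == 2 * lo + 1);
    assert(((P >> 4) & 3) == 2 * hi && ((P >> 6) & 3) == 2 * hi + 1);
  }
  return 0;
}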
- if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR32_NOSPRegClass)) return 0; unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } case X86::SHL16ri: { @@ -1880,10 +1933,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(0).addImm(1 << ShAmt) - .addReg(Src, getKillRegState(isKill)) - .addImm(0).addReg(0); + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } default: { @@ -1906,14 +1957,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, RC)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) return 0; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, 1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), 1); break; } case X86::INC16r: @@ -1921,10 +1970,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, 1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); break; case X86::DEC64r: case X86::DEC32r: @@ -1936,14 +1983,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, RC)) + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) return 0; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, -1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), -1); break; } case X86::DEC16r: @@ -1951,10 +1996,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, -1); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); break; case X86::ADD64rr: case X86::ADD64rr_DB: @@ -1981,9 +2024,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return 0; NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, Src2, isKill2); + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); // Preserve undefness of the operands. bool isUndef = MI->getOperand(1).isUndef(); @@ -2003,9 +2045,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, Src2, isKill2); + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + if (LV && isKill2) LV->replaceKillInstruction(Src2, MI, NewMI); break; @@ -2015,10 +2063,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD64ri32_DB: case X86::ADD64ri8_DB: assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; case X86::ADD32ri: case X86::ADD32ri8: @@ -2026,10 +2073,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri8_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; } case X86::ADD16ri: @@ -2039,10 +2085,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (DisableLEA16) return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addReg(Dest, RegState::Define | - getDeadRegState(isDead)), - Src, isKill, MI->getOperand(2).getImm()); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); break; } } @@ -2051,10 +2096,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!NewMI) return 0; if (LV) { // Update live variables - if (isKill) - LV->replaceKillInstruction(Src, MI, NewMI); - if (isDead) - LV->replaceKillInstruction(Dest, MI, NewMI); + if (Src.isKill()) + LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + if (Dest.isDead()) + LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); } MFI->insert(MBBI, NewMI); // Insert the new inst @@ -3444,6 +3489,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::FsFLD0SS: case X86::FsFLD0SD: return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + case X86::AVX_SET0: + assert(HasAVX && "AVX not supported"); + return Expand2AddrUndef(MI, get(X86::VXORPSYrr)); + case X86::V_SETALLONES: + return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + case X86::AVX2_SETALLONES: + return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; @@ -3557,14 +3609,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 - if (MI->getOpcode() == X86::MOV64r0) - NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI); - else if (MI->getOpcode() == X86::MOV32r0) - NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); - else if (MI->getOpcode() == X86::MOV16r0) - NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI); - else if (MI->getOpcode() == X86::MOV8r0) - NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI); + unsigned Opc = 0; + switch (MI->getOpcode()) { + default: break; + case X86::MOV64r0: Opc = X86::MOV64mi32; break; + case X86::MOV32r0: Opc = X86::MOV32mi; break; + case X86::MOV16r0: Opc = X86::MOV16mi; break; + case X86::MOV8r0: Opc = X86::MOV8mi; break; + } + if (Opc) + NewMI = MakeM0Inst(*this, Opc, MOs, MI); if (NewMI) return NewMI; @@ -3793,15 +3847,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { - case X86::AVX_SET0PSY: - case X86::AVX_SET0PDY: case X86::AVX2_SETALLONES: - case X86::AVX2_SET0: + case X86::AVX_SET0: Alignment = 32; break; case X86::V_SET0: case X86::V_SETALLONES: - case X86::AVX_SETALLONES: Alignment = 16; break; case X86::FsFLD0SD: @@ -3837,11 +3888,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, switch (LoadMI->getOpcode()) { case X86::V_SET0: case X86::V_SETALLONES: - case X86::AVX_SET0PSY: - case X86::AVX_SET0PDY: - case X86::AVX_SETALLONES: case X86::AVX2_SETALLONES: - case X86::AVX2_SET0: + case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. 
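The FMA4 rows added to OpTbl2 and OpTbl3 above, together with the new OpNum == 3 case in canFoldMemoryOperand below, let register allocation fold a spill reload straight into an FMA4 operand instead of reloading into a register first. In miniature, each table row pairs a register-form opcode with its memory form and the alignment the folded access must satisfy; a hypothetical sketch, with names and types invented for illustration:

#include <cstdint>
#include <map>

struct MemForm { uint16_t MemOpc; unsigned MinAlign; };
enum : uint16_t { VFMADDPS4rr = 1, VFMADDPS4rm, VFMADDPS4rrY, VFMADDPS4rmY };

// rr -> rm with required alignment: 16 bytes for the 128-bit form, 32 for
// the 256-bit Y form, mirroring TB_ALIGN_16 / TB_ALIGN_32.
static const std::map<uint16_t, MemForm> FoldTbl = {
  {VFMADDPS4rr,  {VFMADDPS4rm,  16}},
  {VFMADDPS4rrY, {VFMADDPS4rmY, 32}},
};

// Returns the folded opcode, or 0 when there is no table entry or the
// memory operand is not aligned enough for the load-and-op form.
uint16_t foldLoadIntoOp(uint16_t RegOpc, unsigned Align) {
  auto It = FoldTbl.find(RegOpc);
  if (It == FoldTbl.end() || Align < It->second.MinAlign)
    return 0;
  return It->second.MemOpc;
}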
@@ -3873,15 +3921,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Ty = Type::getFloatTy(MF.getFunction()->getContext()); else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); - else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) - Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); - else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0) + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES || - Opc == X86::AVX2_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); @@ -3956,6 +4001,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, OpcodeTablePtr = &RegOp2MemOpTable1; } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; } if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index d293156..304676d 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -114,7 +114,7 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, - [SDNPHasChain]>; + [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, [SDNPHasChain]>; def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, @@ -552,14 +552,21 @@ def HasMMX : Predicate<"Subtarget->hasMMX()">; def Has3DNow : Predicate<"Subtarget->has3DNow()">; def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && Subtarget->hasNoAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && Subtarget->hasNoAVX()">; def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && Subtarget->hasNoAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && Subtarget->hasNoAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && Subtarget->hasNoAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && Subtarget->hasNoAVX()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index c8f40bb..bd54858 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -118,11 +118,11 @@ let Constraints = "$src1 = $dst" in { /// Unary MMX instructions requiring SSSE3. 
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, Intrinsic IntId64, OpndItins itins> { - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>; - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 (bitconvert (memopmmx addr:$src))))], @@ -134,11 +134,11 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in { multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, Intrinsic IntId64, OpndItins itins> { let isCommutable = 0 in - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>; - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, @@ -149,11 +149,11 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, /// PALIGN MMX instructions (require SSSE3). multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { - def R64irr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; - def R64irm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, @@ -163,12 +163,10 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins, Domain d> { - def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], - itins.rr, d>; - def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], - itins.rm, d>; + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, @@ -243,29 +241,30 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), [(store (x86mmx VR64:$src), addr:$dst)], IIC_MMX_MOVQ_RM>; -def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))))], - IIC_MMX_MOVQ_RR>; - -def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2i64 (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src))))))], - IIC_MMX_MOVQ_RR>; +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, 
MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; let neverHasSideEffects = 1 in -def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; -def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", @@ -577,6 +576,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), IIC_MMX_MASKMOV>; // 64-bit bit convert. +let Predicates = [HasSSE2] in { def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), (MMX_MOVD64to64rr GR64:$src)>; def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), @@ -585,5 +585,6 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), (MMX_MOVFR642Qrr FR64:$src)>; +} diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 220c06d..17e91a6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -251,35 +251,37 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), // A 128-bit subvector extract from the first 256-bit vector position // is a subregister copy that needs no instruction. -def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))), +def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))), +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))), +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))), +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; -def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))), +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; -def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))), +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; // A 128-bit subvector insert to the first 256-bit vector position // is a subregister copy that needs no instruction. 
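The i32 0 -> iPTR 0 change in these extract_subvector/insert_subvector patterns is a correctness fix: the subvector index is a pointer-sized constant during selection, so a hard-coded i32 quietly failed to match on x86-64. The "needs no instruction" claim is easy to see from the intrinsics side; both of these compile to nothing:

#include <immintrin.h>

// Reading the low 128 bits of a YMM value is a plain subregister read.
__m128 low_half(__m256 v) { return _mm256_castps256_ps128(v); }

// Placing an XMM value into the low lane of an otherwise-undefined YMM
// value (the insert_subvector-into-undef patterns) is likewise free.
__m256 as_ymm(__m128 v) { return _mm256_castps128_ps256(v); }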
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)), +let AddedComplexity = 25 in { // to give priority over vinsertf128rm +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)), +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +} // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), @@ -362,7 +364,7 @@ let Predicates = [HasAVX] in { def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; } -// Alias instructions that map fld0 to pxor for sse. +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1 in { @@ -382,11 +384,11 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, neverHasSideEffects = 1 in { -def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>; + isPseudo = 1 in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -def : Pat<(v4f32 immAllZerosV), (V_SET0)>; def : Pat<(v2f64 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; def : Pat<(v2i64 immAllZerosV), (V_SET0)>; @@ -394,35 +396,29 @@ def : Pat<(v8i16 immAllZerosV), (V_SET0)>; def : Pat<(v16i8 immAllZerosV), (V_SET0)>; -// The same as done above but for AVX. The 256-bit ISA does not support PI, +// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, // and doesn't need it because on sandy bridge the register is set to zero // at the rename stage without using any execution unit, so SET0PSY // and SET0PDY can be used for vector int instructions without penalty -// FIXME: Change encoding to pseudo! This is blocked right now by the x86 -// JIT implementatioan, it does not expand the instructions below like -// X86MCInstLower does. 
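Making V_SET0 a true pseudo (and collapsing AVX_SET0PSY/AVX_SET0PDY/AVX2_SET0 into one AVX_SET0) retires the FIXME above: selection of the concrete zero idiom now happens in expandPostRAPseudo, shown in the X86InstrInfo.cpp hunk earlier. Roughly, the dispatch amounts to the following sketch (opcode spellings abbreviated; the xor/pcmpeq-with-self idioms need no constant pool entry and start no dependence chain):

#include <string>

enum Pseudo { V_SET0, AVX_SET0, V_SETALLONES, AVX2_SETALLONES };

// Post-RA expansion of the zero / all-ones pseudos, modeled on the
// Expand2AddrUndef calls above; the destination register is reused for
// every operand.
std::string expand(Pseudo P, bool HasAVX) {
  switch (P) {
  case V_SET0:          return HasAVX ? "vxorps x,x,x" : "xorps x,x";
  case AVX_SET0:        return "vxorps y,y,y";          // asserts HasAVX
  case V_SETALLONES:    return HasAVX ? "vpcmpeqd x,x,x" : "pcmpeqd x,x";
  case AVX2_SETALLONES: return "vpcmpeqd y,y,y";        // requires AVX2
  }
  return {};
}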
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in { -let Predicates = [HasAVX] in { -def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; -def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; -} -let Predicates = [HasAVX2], neverHasSideEffects = 1 in -def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "", - []>, VEX_4V; + isPseudo = 1, Predicates = [HasAVX] in { +def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>; } -let Predicates = [HasAVX2], AddedComplexity = 5 in { - def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>; - def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>; +let Predicates = [HasAVX] in + def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; + +let Predicates = [HasAVX2] in { + def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; + def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>; + def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; + def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; } -// AVX has no support for 256-bit integer instructions, but since the 128-bit +// AVX1 has no support for 256-bit integer instructions, but since the 128-bit // VPXOR instruction writes zero to its upper part, it's safe to build zeros. +let Predicates = [HasAVX1Only] in { def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; def : Pat<(bc_v32i8 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; @@ -438,22 +434,17 @@ def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +} // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. -// FIXME: Change encoding to pseudo! This is blocked right now by the x86 -// JIT implementation, it does not expand the instructions below like -// X86MCInstLower does.
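The HasAVX1Only patterns above rely on an architectural guarantee: every VEX-encoded 128-bit instruction zeroes bits 255:128 of its destination, so a 256-bit integer zero can legally be built from the 128-bit V_SET0 with a plain SUBREG_TO_REG. From C, the same zero costs one VEX xor even without AVX2, e.g. (assuming an AVX1 target):

#include <immintrin.h>

// Typically emitted as a single vxorps ymm,ymm,ymm on AVX1; no AVX2
// integer instructions are needed to materialize a 256-bit zero.
__m256i zero256(void) { return _mm256_setzero_si256(); }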
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt in { - let Predicates = [HasAVX] in - def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; - def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>; + isPseudo = 1 in { + def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; let Predicates = [HasAVX2] in - def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "", - [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V; + def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; } @@ -605,27 +596,27 @@ let Predicates = [HasAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))), + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))), + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))), + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))), + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))), + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; // Move low f64 and clear high bits. @@ -704,7 +695,7 @@ let Predicates = [HasAVX] in { (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. @@ -738,7 +729,7 @@ let Predicates = [HasSSE1] in { (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. 
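On the V_SETALLONES/AVX2_SETALLONES pseudos above: comparing a register for equality with itself sets every bit regardless of the register's prior contents, which is the pcmpeqd idiom the pseudo expands to; keeping canFoldAsLoad still allows the constant-pool form when folding pays off. The same idiom from intrinsics (a sketch; compilers typically materialize this as pcmpeqd rather than a 16-byte load):

#include <emmintrin.h>

// All ones without touching memory: x == x is true in every lane.
__m128i all_ones(void) {
  __m128i x = _mm_setzero_si128();
  return _mm_cmpeq_epi32(x, x);
}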
@@ -916,16 +907,16 @@ let isCodeGenOnly = 1 in { let Predicates = [HasAVX] in { def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))), + (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))), + (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))), + (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))), + (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; } @@ -975,10 +966,10 @@ let Predicates = [HasAVX] in { (VMOVUPDmr addr:$dst, VR128:$src)>; } -let Predicates = [HasSSE1] in +let Predicates = [UseSSE1] in def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), (MOVUPSmr addr:$dst, VR128:$src)>; -let Predicates = [HasSSE2] in +let Predicates = [UseSSE2] in def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), (MOVUPDmr addr:$dst, VR128:$src)>; @@ -1028,12 +1019,52 @@ let Predicates = [HasAVX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + + // Special patterns for storing subvector extracts of the lower 128 bits + // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), +
(VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; } // Use movaps / movups for SSE integer load / store (one byte shorter). // The instructions selected below are then converted to MOVDQA/MOVDQU // during the SSE domain pass. -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), @@ -1180,7 +1211,7 @@ let Predicates = [HasAVX] in { (VMOVLPDmr addr:$src1, VR128:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), addr:$src1), @@ -1205,7 +1236,7 @@ let Predicates = [HasSSE1] in { (MOVLPSmr addr:$src1, VR128:$src2)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; @@ -1279,7 +1310,7 @@ let Predicates = [HasAVX] in { (VMOVHPDrm VR128:$src1, addr:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // MOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), @@ -1289,7 +1320,7 @@ let Predicates = [HasSSE1] in { (MOVHPSrm VR128:$src1, addr:$src2)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time @@ -1346,7 +1377,7 @@ let Predicates = [HasAVX] in { (VMOVHLPSrr VR128:$src1, VR128:$src2)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { // MOVLHPS patterns def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr VR128:$src1, VR128:$src2)>; @@ -1456,7 +1487,7 @@ def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX] in { def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -1633,7 +1664,7 @@ defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - TB, Requires<[HasSSE2]>; + TB, Requires<[UseSSE2]>; /// SSE 2 Only @@ -1663,7 +1694,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))], IIC_SSE_CVT_Scalar_RM>, XD, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1684,13 +1715,13 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>; def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, 
Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>; } // Convert scalar single to scalar double @@ -1709,30 +1740,28 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; } -let AddedComplexity = 1 in { // give AVX priority - def : Pat<(f64 (fextend FR32:$src)), - (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; +def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; - def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[HasAVX, OptForSpeed]>; -} // AddedComplexity = 1 +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[HasAVX, OptForSpeed]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fextend FR32:$src))], IIC_SSE_CVT_Scalar_RR>, XS, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (extloadf32 addr:$src))], IIC_SSE_CVT_Scalar_RM>, XS, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; // extload f32 -> f64. This matches load+fextend because we have a hack in // the isel (PreprocessForFPConvert) that can introduce loads after dag @@ -1740,9 +1769,9 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), // Since these loads aren't folded into the fextend, we have to match it // explicitly here. 
def : Pat<(fextend (loadf32 addr:$src)), - (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>; + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>; + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1762,13 +1791,13 @@ def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>; def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>; + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>; } // Convert packed single/double fp to doubleword @@ -1904,7 +1933,7 @@ let Predicates = [HasAVX] in { (VCVTTPS2DQYrm addr:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (CVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), @@ -1978,10 +2007,10 @@ def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], IIC_SSE_CVT_PD_RR>, TB, VEX; -let neverHasSideEffects = 1, mayLoad = 1 in def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, VEX; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, @@ -1994,15 +2023,15 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), IIC_SSE_CVT_PD_RM>, TB, VEX; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], IIC_SSE_CVT_PD_RR>, TB; -let neverHasSideEffects = 1, mayLoad = 1 in def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>, TB; + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB; } // Convert Packed DW Integers to Packed Double FP @@ -2105,11 +2134,11 @@ let Predicates = [HasAVX] in { (VCVTPS2PDrr VR128:$src)>; def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), (VCVTPS2PDYrr VR128:$src)>; - def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), + def : Pat<(v4f64 (extloadv4f32 addr:$src)), (VCVTPS2PDYrm addr:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Match fextend for 128 conversions def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), (CVTPS2PDrr VR128:$src)>; @@ -2336,14 +2365,14 @@ def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPSrri (v4f32 VR128:$src1), (v4f32 
VR128:$src2), imm:$cc)>; def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), @@ -2420,7 +2449,7 @@ let Predicates = [HasAVX] in { (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; } -let Predicates = [HasSSE1] in { +let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2428,7 +2457,7 @@ let Predicates = [HasSSE1] in { (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // Generic SHUFPD patterns def : Pat<(v2i64 (X86Shufp VR128:$src1, (memopv2i64 addr:$src2), (i8 imm:$imm))), @@ -2500,7 +2529,27 @@ let Constraints = "$src1 = $dst" in { SSEPackedDouble>, TB, OpSize; } // Constraints = "$src1 = $dst" -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX1Only] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + +let Predicates = [HasAVX] in { // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel @@ -2509,7 +2558,7 @@ let Predicates = [HasAVX], AddedComplexity = 1 in { (VUNPCKLPDrr VR128:$src, VR128:$src)>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. 
One use disappears at isel @@ -2578,16 +2627,16 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", def : Pat<(i32 (X86fgetsign FR32:$src)), (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; def : Pat<(i64 (X86fgetsign FR32:$src)), (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>, - Requires<[HasSSE1]>; + Requires<[UseSSE1]>; def : Pat<(i32 (X86fgetsign FR64:$src)), (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; def : Pat<(i64 (X86fgetsign FR64:$src)), (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>, - Requires<[HasSSE2]>; + Requires<[UseSSE2]>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions @@ -2683,14 +2732,12 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, } // Alias bitwise logical operations using SSE logical ops on packed FP values. -let mayLoad = 0 in { - defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, - SSE_BIT_ITINS_P>; - defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, - SSE_BIT_ITINS_P>; - defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, - SSE_BIT_ITINS_P>; -} +defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; +defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + SSE_BIT_ITINS_P>; +defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, @@ -2794,27 +2841,23 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins, bit Is2Addr = 1> { - let mayLoad = 0 in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>, TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>, TB, OpSize; - } } multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { - let mayLoad = 0 in { - defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, + defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>, TB; - defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, + defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>, TB, OpSize; - } } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, @@ -2924,7 +2967,7 @@ let Constraints = "$src1 = $dst" in { } } -let isCommutable = 1, isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 in { defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>, VEX_4V, VEX_LIG; defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>, @@ -2978,7 +3021,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[HasSSE1, OptForSize]>; + Requires<[UseSSE1, OptForSize]>; def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>; @@ -2992,7 +3035,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - let mayLoad = 1 in + let mayLoad = 1 in { def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; @@ -3000,6 +3043,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { (ins VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + } } /// sse1_fp_unop_p - SSE1 unops in packed form. @@ -3062,7 +3106,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, - Requires<[HasSSE2, OptForSize]>; + Requires<[UseSSE2, OptForSize]>; def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>; @@ -3072,20 +3116,20 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, } /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. +let hasSideEffects = 0 in multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { - let neverHasSideEffects = 1 in { def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - let mayLoad = 1 in + let mayLoad = 1 in { def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - } def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + } } /// sse2_fp_unop_p - SSE2 unops in vector forms. 
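Note that the memory forms of the scalar unops (SSm above, SDm just below) keep their OptForSize requirement even after the predicate change. Folding the load into a scalar op merges the result into the destination register's untouched upper lanes, which carries a false dependency on the register's previous value; at full optimization the isel instead emits an explicit MOVSS load plus the register form, mirroring the extloadf32/OptForSpeed patterns earlier in the file. That rationale is my reading of the requirement, not something the patch states. A small C++ illustration of the source both strategies compile from:

#include <cmath>

// Under -Os (OptForSize) the load folds into the op: sqrtss (%rdi), %xmm0.
// Otherwise a separate movss feeds sqrtss in register form, which avoids
// the false dependency described above.
float sqrtFromMem(const float *p) {
  return std::sqrt(*p); // matches (OpNode (load addr:$src)) in SSm
}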
@@ -3176,7 +3220,6 @@ let Predicates = [HasAVX] in { SSE_RCPP>, VEX; } -let AddedComplexity = 1 in { def : Pat<(f32 (fsqrt FR32:$src)), (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -3199,9 +3242,8 @@ def : Pat<(f32 (X86frcp FR32:$src)), def : Pat<(f32 (X86frcp (load addr:$src))), (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX, OptForSize]>; -} -let Predicates = [HasAVX], AddedComplexity = 1 in { +let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), @@ -3322,7 +3364,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), IIC_SSE_MOVNT>; def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), - (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>; // There is no AVX form for instructions below this point def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), @@ -3482,7 +3524,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; // For Disassembler let isCodeGenOnly = 1 in { @@ -3492,7 +3534,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; } let canFoldAsLoad = 1, mayLoad = 1 in { @@ -3504,7 +3546,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], IIC_SSE_MOVU_P_RM>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } let mayStore = 1 in { @@ -3516,7 +3558,7 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(store (v2i64 VR128:$src), addr:$dst)*/], IIC_SSE_MOVU_P_MR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } // Intrinsic forms of MOVDQU load and store @@ -3530,7 +3572,7 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], IIC_SSE_MOVU_P_MR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } // ExeDomain = SSEPackedInt @@ -4028,7 +4070,7 @@ let Predicates = [HasAVX2] in { (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), @@ -4210,7 +4252,7 @@ let Predicates = [HasAVX2] in { defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX; } -let Predicates = [HasSSE2] in { +let Predicates = [UseSSE2] in { let AddedComplexity = 5 in defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize; @@ -4325,28 +4367,6 @@ let Constraints = "$src1 = $dst" in { } } // ExeDomain = SSEPackedInt -// Patterns for using AVX1 instructions with integer vectors -// Here to give AVX2 priority -let Predicates = [HasAVX] in { - def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 
addr:$src2)))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - - def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; -} - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Extract and Insert //===---------------------------------------------------------------------===// @@ -4395,7 +4415,7 @@ let Predicates = [HasAVX] in { } let Constraints = "$src1 = $dst" in - defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>; + defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>; } // ExeDomain = SSEPackedInt @@ -4556,7 +4576,7 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), // Move Packed Doubleword Int first element to Doubleword Int // def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "vmov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, @@ -4672,14 +4692,14 @@ let Predicates = [HasAVX] in { } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))), + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))), + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } -let Predicates = [HasSSE2], AddedComplexity = 20 in { +let Predicates = [UseSSE2], AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (MOVZDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -4719,7 +4739,7 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))], IIC_SSE_MOVDQ>, XS, - Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix + Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix //===---------------------------------------------------------------------===// // Move Packed Quadword Int to Quadword Int @@ -4762,7 +4782,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; let Predicates = [HasAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), @@ -4773,7 +4793,7 @@ let Predicates = [HasAVX], AddedComplexity = 20 in { (VMOVZQI2PQIrm addr:$src)>; } -let Predicates = [HasSSE2], AddedComplexity = 20 in { +let Predicates = [UseSSE2], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), @@ -4803,7 +4823,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -4818,7 +4838,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (loadv2i64 addr:$src))))], IIC_SSE_MOVDQ>, - XS, Requires<[HasSSE2]>; + XS, Requires<[UseSSE2]>; } let AddedComplexity = 20 in { @@ -4828,7 +4848,7 @@ let AddedComplexity = 20 in { def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (VMOVZPQILo2PQIrr VR128:$src)>; } - let Predicates = [HasSSE2] in { + let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZPQILo2PQIrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), @@ -4908,7 +4928,7 @@ let Predicates = [HasAVX] in { (VMOVSLDUPYrm addr:$src)>; } -let Predicates = [HasSSE3] in { +let Predicates = [UseSSE3] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), @@ -4977,7 +4997,7 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr VR256:$src)>; } -let Predicates = [HasSSE3] in { +let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), @@ -5041,7 +5061,7 @@ let Predicates = [HasAVX] in { f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; } } -let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in { +let Constraints 
= "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, f128mem, SSE_ALU_F32P>, TB, XD; @@ -5424,7 +5444,7 @@ let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Predicates = [HasAVX2] in defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V; -let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in +let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palign<"palignr">; let Predicates = [HasAVX2] in { @@ -5449,7 +5469,7 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; } -let Predicates = [HasSSSE3] in { +let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), @@ -5583,7 +5603,7 @@ let Predicates = [HasAVX] in { (VPMOVZXDQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load. def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), (PMOVSXBWrm addr:$src)>; @@ -5633,7 +5653,7 @@ let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; } @@ -5704,7 +5724,7 @@ let Predicates = [HasAVX] in { (VPMOVZXWQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), (PMOVSXBDrm addr:$src)>; @@ -5772,7 +5792,7 @@ let Predicates = [HasAVX] in { (VPMOVZXBQrm addr:$src)>; } -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq (bitconvert (v4i32 (X86vzmovl @@ -5918,7 +5938,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))), addr:$dst), (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasSSE41]>; + Requires<[UseSSE41]>; //===----------------------------------------------------------------------===// // SSE4.1 - Insert Instructions @@ -6190,6 +6210,15 @@ let Predicates = [HasAVX] in { (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; def : Pat<(f64 (ftrunc FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x1))>; + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x1))>; + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x1))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6199,26 +6228,33 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; -def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; -def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; -def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; -def : Pat<(f64 (fnearbyint FR64:$src)), - 
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; -def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; -def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; -def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; -def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; -def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; -def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x1))>; + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x1))>; +} //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test @@ -6356,7 +6392,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; } -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, Intrinsic IntId256> { let isCommutable = 1 in @@ -6705,7 +6741,7 @@ def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; -let Predicates = [HasSSE41] in { +let Predicates = [UseSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), (v16i8 VR128:$src2))), (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; @@ -6802,9 +6838,8 @@ multiclass pseudo_pcmpistrm<string asm> { } let Defs = [EFLAGS], usesCustomInserter = 1 in { - let AddedComplexity = 1 in - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; } let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in { @@ -6840,9 +6875,8 @@ multiclass pseudo_pcmpestrm<string asm> { } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - let AddedComplexity = 1 in - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>; + defm VPCMPESTRM128 : 
pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; } let Predicates = [HasAVX], @@ -7237,40 +7271,59 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), let Predicates = [HasAVX] in { def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX1Only] in { def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (memopv2i64 addr:$src2)), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), - (i32 imm)), +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (memopv2i64 addr:$src2)), + (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; } @@ -7290,56 +7343,61 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), []>, VEX; } -// Extract and store. 
-let Predicates = [HasAVX] in { - def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - - def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; - def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))), - (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; -} - // AVX1 patterns let Predicates = [HasAVX] in { -def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; -def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), - (VEXTRACTF128rr VR256:$src1, imm:$src2)>; - -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v4f32 (VEXTRACTF128rr (v8f32 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v2f64 (VEXTRACTF128rr (v4f64 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + +def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, 
VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -7456,29 +7514,29 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX] in { +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX1Only] in { def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, - (memopv8f32 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, - (memopv4f64 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; @@ -7665,19 +7723,22 @@ let Predicates = [HasAVX2] in { } // AVX1 broadcast patterns -let Predicates = [HasAVX] in { +let Predicates = [HasAVX1Only] in { def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), (VBROADCASTSDYrm addr:$src)>; def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
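The broadcast hunk above splits the integer load-broadcast patterns off under HasAVX1Only while the floating-point ones stay under HasAVX: an AVX2 target is expected to select its native integer broadcasts (vpbroadcastd/vpbroadcastq) for the same X86VBroadcast nodes rather than borrow the FP instructions. In source terms, the patterns kept under [HasAVX] correspond to these one-instruction C++ intrinsics (assuming an -mavx build):

#include <immintrin.h>

// Each maps to a single vbroadcastss/vbroadcastsd from memory on any AVX CPU.
__m256  splat8f(const float *p)  { return _mm256_broadcast_ss(p); }
__m256d splat4d(const double *p) { return _mm256_broadcast_sd(p); }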
@@ -7757,7 +7818,6 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // -let AddedComplexity = 1 in { def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7768,9 +7828,8 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), (i8 imm:$src3)))]>, VEX_4V; -} -let Predicates = [HasAVX2], AddedComplexity = 1 in { +let Predicates = [HasAVX2] in { def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), @@ -7805,23 +7864,43 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), []>, VEX_4V; } -let Predicates = [HasAVX2], AddedComplexity = 1 in { +let Predicates = [HasAVX2] in { def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (i32 imm)), + (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (memopv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -7838,23 +7917,40 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; -let Predicates = [HasAVX2], AddedComplexity = 1 in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +let Predicates = [HasAVX2] in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTI128rr (v4i64 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTI128rr (v8i32 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : 
Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTI128rr (v16i16 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), +def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTI128rr (v32i8 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextractf128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 7ac4cec..764aa5d 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -532,7 +532,7 @@ uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) { #endif } -template<typename T> void addUnaligned(void *Pos, T Delta) { +template<typename T> static void addUnaligned(void *Pos, T Delta) { T Value; std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos), sizeof(T)); diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 9c0ce4e..1c2ef25 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -377,12 +377,6 @@ ReSimplify: case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; - case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; - case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; - case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break; - case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break; - case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 877b8f6..3b4cfc4 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -522,7 +522,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const{ + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 9087852..0d7b664 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -346,6 +346,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string 
&CPU, , HasVectorUAMem(false) , HasCmpxchg16b(false) , UseLeaForSP(false) + , HasSlowDivide(false) , PostRAScheduler(false) , stackAlignment(4) // FIXME: this is a known good value for Yonah. How about others? diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 6841c5b..dde7e24 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -136,6 +136,10 @@ protected: /// the stack pointer. This is an optimization for Intel Atom processors. bool UseLeaForSP; + /// HasSlowDivide - True if smaller divides are significantly faster than + /// full divides and should be used when possible. + bool HasSlowDivide; + /// PostRAScheduler - True if using post-register-allocation scheduler. bool PostRAScheduler; @@ -198,6 +202,7 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasNoAVX() const { return X86SSELevel < AVX; } bool hasSSE4A() const { return HasSSE4A; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } @@ -205,7 +210,8 @@ public: bool hasAES() const { return HasAES; } bool hasPCLMUL() const { return HasPCLMUL; } bool hasFMA() const { return HasFMA; } - bool hasFMA4() const { return HasFMA4; } + // FIXME: Favor FMA when both are enabled. Is this the right thing to do? + bool hasFMA4() const { return HasFMA4 && !HasFMA; } bool hasXOP() const { return HasXOP; } bool hasMOVBE() const { return HasMOVBE; } bool hasRDRAND() const { return HasRDRAND; } @@ -219,6 +225,7 @@ public: bool hasVectorUAMem() const { return HasVectorUAMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } + bool hasSlowDivide() const { return HasSlowDivide; } bool isAtom() const { return X86ProcFamily == IntelAtom; } diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 80b75dc..449eed3 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -42,7 +42,6 @@ namespace { private: const TargetInstrInfo *TII; // Machine instruction info. - MachineBasicBlock *MBB; // Current basic block // Any YMM register live-in to this function? 
   bool FnHasLiveInYmm;
@@ -84,7 +83,7 @@ namespace {
     // 2) All states must be clean for the result to be clean
     // 3) If none above and one unknown, the result state is also unknown
     //
-    unsigned computeState(unsigned PrevState, unsigned CurState) {
+    static unsigned computeState(unsigned PrevState, unsigned CurState) {
       if (PrevState == ST_INIT)
         return CurState;
 
@@ -122,7 +121,7 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
 }
 
 static bool hasYmmReg(MachineInstr *MI) {
-  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg())
       continue;
@@ -189,7 +188,6 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
                                            MachineBasicBlock &BB) {
   bool Changed = false;
   unsigned BBNum = BB.getNumber();
-  MBB = &BB;
 
   // Don't process already solved BBs
   if (BBSolved[BBNum])
@@ -207,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
 
   // The entry MBB for the function may set the initial state to dirty if
   // the function receives any YMM incoming arguments
-  if (MBB == MF.begin()) {
+  if (&BB == MF.begin()) {
     EntryState = ST_CLEAN;
     if (FnHasLiveInYmm)
       EntryState = ST_DIRTY;
@@ -253,7 +251,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   // When unknown, only compute the information within the block to have
   // it available in the exit if possible, but don't change the block.
   if (EntryState != ST_UNKNOWN) {
-    BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+    BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
     ++NumVZU;
   }
 
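For reference, the block-state merge that computeState (made static above) implements can be read off the quoted rules plus the ST_INIT passthrough visible in the hunk. The rule numbered 1) falls outside the context lines shown, so treating a dirty input as dominant, like the enum ordering, is an assumption of this standalone C++ sketch:

// Reconstructed merge; DIRTY dominating UNKNOWN is assumed, not quoted.
enum State { ST_CLEAN, ST_DIRTY, ST_UNKNOWN, ST_INIT };

static State mergeState(State Prev, State Cur) {
  if (Prev == ST_INIT)                     // first predecessor seen
    return Cur;
  if (Prev == ST_DIRTY || Cur == ST_DIRTY) // assumed: any dirty edge taints
    return ST_DIRTY;
  if (Prev == ST_CLEAN && Cur == ST_CLEAN) // rule 2: clean needs all clean
    return ST_CLEAN;
  return ST_UNKNOWN;                       // rule 3: otherwise unknown
}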