Diffstat (limited to 'lib/Target/X86')
80 files changed, 12312 insertions, 11738 deletions
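Before the per-file hunks, one orienting note: the change that recurs throughout the AsmParser and AsmInstrumentation hunks below is that X86Operand::CreateMem now takes the pointer width of the current decode mode ("ModeSize") as its first argument, supplied by a new getPointerWidth() helper derived from the Mode16Bit/Mode32Bit/Mode64Bit feature bits, and X86Operand gains ModeSize-aware predicates such as isMemOffs32_8(). The stand-alone C++ sketch below is illustrative only and is not part of the commit; Mode and MemOperand are hypothetical stand-ins for the real LLVM types, and the predicate is simplified (the real one also requires no base/index register and scale 1).

    #include <cassert>

    // Hypothetical stand-in for the subtarget mode feature bits.
    enum class Mode { Bits16, Bits32, Bits64 };

    // Hypothetical stand-in for the memory-operand record; the commit adds a
    // ModeSize field next to the existing Size field.
    struct MemOperand {
      unsigned ModeSize; // width of the decode mode in bits
      unsigned Size;     // operand size in bits, 0 if unsized
    };

    static unsigned getPointerWidth(Mode M) {
      switch (M) {
      case Mode::Bits16: return 16;
      case Mode::Bits32: return 32;
      case Mode::Bits64: return 64;
      }
      return 0; // unreachable
    }

    // Simplified mirror of the new isMemOffs32_8()-style predicates: match on
    // the decode mode as well as the (possibly unknown) operand size.
    static bool isMemOffs32_8(const MemOperand &Op) {
      return Op.ModeSize == 32 && (Op.Size == 0 || Op.Size == 8);
    }

    int main() {
      MemOperand Op{getPointerWidth(Mode::Bits32), 8};
      assert(isMemOffs32_8(Op));
      return 0;
    }

In the patch itself the width comes from the subtarget, as shown in the X86AsmInstrumentation.cpp and X86AsmParser.cpp hunks that follow.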
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index 861a41d..08646d0 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \ x86_codegen_SRC_FILES := \ X86AsmPrinter.cpp \ + X86CallFrameOptimization.cpp \ X86FastISel.cpp \ X86FixupLEAs.cpp \ X86FloatingPoint.cpp \ diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9c49a11..543af8e 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -165,9 +165,9 @@ public: } unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { - static const unsigned Candidates[] = { X86::RBP, X86::RAX, X86::RBX, - X86::RCX, X86::RDX, X86::RDI, - X86::RSI }; + static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, + X86::RCX, X86::RDX, X86::RDI, + X86::RSI }; for (unsigned Reg : Candidates) { if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) return convReg(Reg, VT); @@ -261,6 +261,23 @@ protected: int64_t Displacement, MCContext &Ctx, int64_t *Residue); + bool is64BitMode() const { + return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + bool is32BitMode() const { + return (STI.getFeatureBits() & X86::Mode32Bit) != 0; + } + bool is16BitMode() const { + return (STI.getFeatureBits() & X86::Mode16Bit) != 0; + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + // True when previous instruction was actually REP prefix. bool RepPrefix; @@ -301,7 +318,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, { const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); + getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, Out); } @@ -310,7 +327,8 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, { const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), SMLoc())); + getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), + SMLoc())); InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, Out); } @@ -319,7 +337,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, { const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); + getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); } @@ -327,7 +345,8 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, { const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), SMLoc())); + getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), + SMLoc())); InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); } @@ -445,7 +464,8 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, const MCConstantExpr *Disp = MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx); std::unique_ptr<X86Operand> DispOp = - 
X86Operand::CreateMem(0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), + SMLoc()); EmitLEA(*DispOp, VT, Reg, Out); Residue -= Disp->getValue(); } @@ -459,9 +479,10 @@ X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, if (Displacement == 0 || (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { *Residue = Displacement; - return X86Operand::CreateMem(Op.getMemSegReg(), Op.getMemDisp(), - Op.getMemBaseReg(), Op.getMemIndexReg(), - Op.getMemScale(), SMLoc(), SMLoc()); + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), + Op.getMemDisp(), Op.getMemBaseReg(), + Op.getMemIndexReg(), Op.getMemScale(), + SMLoc(), SMLoc()); } int64_t OrigDisplacement = @@ -474,9 +495,9 @@ X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, *Residue = Displacement - NewDisplacement; const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx); - return X86Operand::CreateMem(Op.getMemSegReg(), Disp, Op.getMemBaseReg(), - Op.getMemIndexReg(), Op.getMemScale(), SMLoc(), - SMLoc()); + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, + Op.getMemBaseReg(), Op.getMemIndexReg(), + Op.getMemScale(), SMLoc(), SMLoc()); } class X86AddressSanitizer32 : public X86AddressSanitizer { @@ -625,7 +646,8 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } @@ -634,7 +656,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( AddressRegI32)); @@ -644,12 +666,14 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( .addImm(7)); switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 1: break; case 2: { const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } @@ -659,9 +683,6 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( .addReg(ScratchRegI32) .addImm(3)); break; - default: - assert(false && "Incorrect access size"); - break; } EmitInstruction( @@ -669,7 +690,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); @@ -692,26 +713,25 @@ void X86AddressSanitizer32::InstrumentMemOperandLarge( { MCInst Inst; switch (AccessSize) { 
+ default: llvm_unreachable("Incorrect access size"); case 8: Inst.setOpcode(X86::CMP8mi); break; case 16: Inst.setOpcode(X86::CMP16mi); break; - default: - assert(false && "Incorrect access size"); - break; } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); } MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); @@ -727,7 +747,7 @@ void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction( Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); // Instrument first and last elements in src and dst range. InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, @@ -843,7 +863,8 @@ private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, + SMLoc(), SMLoc())); EmitLEA(*Op, MVT::i64, X86::RSP, Out); OrigSPOffset += Offset; } @@ -896,7 +917,8 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } @@ -905,7 +927,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( AddressRegI32)); @@ -915,12 +937,14 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( .addImm(7)); switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 1: break; case 2: { const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } @@ -930,9 +954,6 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( .addReg(ScratchRegI32) .addImm(3)); break; - default: - assert(false && "Incorrect access size"); - break; } EmitInstruction( @@ -940,7 +961,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( 
MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); @@ -963,19 +984,18 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge( { MCInst Inst; switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 8: Inst.setOpcode(X86::CMP8mi); break; case 16: Inst.setOpcode(X86::CMP16mi); break; - default: - assert(false && "Incorrect access size"); - break; } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); @@ -983,7 +1003,7 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge( MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); @@ -999,7 +1019,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction( Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); // Instrument first and last elements in src and dst range. InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8ef2a55..0b6fb52 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -86,7 +86,7 @@ private: typedef std::pair< InfixCalculatorTok, int64_t > ICToken; SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; SmallVector<ICToken, 4> PostfixStack; - + public: int64_t popOperand() { assert (!PostfixStack.empty() && "Poped an empty stack!"); @@ -100,7 +100,7 @@ private: "Unexpected operand!"); PostfixStack.push_back(std::make_pair(Op, Val)); } - + void popOperator() { InfixOperatorStack.pop_back(); } void pushOperator(InfixCalculatorTok Op) { // Push the new operator if the stack is empty. @@ -108,7 +108,7 @@ private: InfixOperatorStack.push_back(Op); return; } - + // Push the new operator if it has a higher precedence than the operator // on the top of the stack or the operator on the top of the stack is a // left parentheses. @@ -118,7 +118,7 @@ private: InfixOperatorStack.push_back(Op); return; } - + // The operator on the top of the stack has higher precedence than the // new operator. unsigned ParenCount = 0; @@ -126,17 +126,17 @@ private: // Nothing to process. if (InfixOperatorStack.empty()) break; - + Idx = InfixOperatorStack.size() - 1; StackOp = InfixOperatorStack[Idx]; if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) break; - + // If we have an even parentheses count and we see a left parentheses, // then stop processing. 
if (!ParenCount && StackOp == IC_LPAREN) break; - + if (StackOp == IC_RPAREN) { ++ParenCount; InfixOperatorStack.pop_back(); @@ -158,10 +158,10 @@ private: if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) PostfixStack.push_back(std::make_pair(StackOp, 0)); } - + if (PostfixStack.empty()) return 0; - + SmallVector<ICToken, 16> OperandStack; for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { ICToken Op = PostfixStack[i]; @@ -263,7 +263,7 @@ private: State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), AddImmPrefix(addimmprefix) { Info.clear(); } - + unsigned getBaseReg() { return BaseReg; } unsigned getIndexReg() { return IndexReg; } unsigned getScale() { return Scale; } @@ -684,6 +684,7 @@ private: bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -711,13 +712,6 @@ private: uint64_t &ErrorInfo, bool MatchingInlineAsm); - unsigned getPointerSize() { - if (is16BitMode()) return 16; - if (is32BitMode()) return 32; - if (is64BitMode()) return 64; - llvm_unreachable("invalid mode"); - } - bool OmitRegisterFromClobberLists(unsigned RegNo) override; /// doSrcDstMatch - Returns true if operands are matching in their @@ -977,16 +971,18 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg, - /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); } std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg, - /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); } std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { @@ -1027,8 +1023,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Create an absolute memory reference in order to match against // instructions taking a PC relative operand. - return X86Operand::CreateMem(Disp, Start, End, Size, Identifier, - Info.OpDecl); + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, + Identifier, Info.OpDecl); } // We either have a direct symbol reference, or an offset from a symbol. The @@ -1050,8 +1046,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // if we don't know the actual value at this time. This is necessary to // get the matching correct in some cases. BaseReg = BaseReg ? 
BaseReg : 1; - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size, Identifier, Info.OpDecl); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, Identifier, + Info.OpDecl); } static void @@ -1103,7 +1100,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, (*I).Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); - // Skip everything before the symbol. + // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); @@ -1128,7 +1125,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { // identifier. Don't try an parse it as a register. if (Tok.getString().startswith(".")) break; - + // If we're parsing an immediate expression, we don't expect a '['. if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; @@ -1194,7 +1191,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCSymbol *Sym = getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - const MCExpr *Val = + const MCExpr *Val = MCSymbolRefExpr::Create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); @@ -1279,7 +1276,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, const MCExpr *NewDisp; if (ParseIntelDotOperator(Disp, NewDisp)) return nullptr; - + End = Tok.getEndLoc(); Parser.Lex(); // Eat the field. Disp = NewDisp; @@ -1292,17 +1289,17 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // handle [-42] if (!BaseReg && !IndexReg) { if (!SegReg) - return X86Operand::CreateMem(Disp, Start, End, Size); - else - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + Start, End, Size); } StringRef ErrMsg; if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { Error(StartInBrac, ErrMsg); return nullptr; } - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size); } InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1383,9 +1380,9 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, // be followed by a bracketed expression. If it isn't we know we have our // final segment override. 
const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext()); - return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0, - /*Scale=*/1, Start, ImmDispToken.getEndLoc(), - Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, + Start, ImmDispToken.getEndLoc(), Size); } } @@ -1398,7 +1395,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, if (getParser().parsePrimaryExpr(Val, End)) return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - return X86Operand::CreateMem(Val, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); } InlineAsmIdentifierInfo Info; @@ -1428,7 +1425,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, if (getParser().parsePrimaryExpr(Val, End)) return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - return X86Operand::CreateMem(Val, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); } InlineAsmIdentifierInfo Info; @@ -1466,9 +1463,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, // BaseReg is non-zero to avoid assertions. In the context of inline asm, // we're pointing to a local variable in memory, so the base register is // really the frame or stack pointer. - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/1, /*IndexReg=*/0, - /*Scale=*/1, Start, End, Size, Identifier, - Info.OpDecl); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1, + Start, End, Size, Identifier, Info.OpDecl); } /// Parse the '.' operator. @@ -1643,7 +1640,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // to the MCExpr with the directional local symbol and this is a // memory operand not an immediate operand. if (SM.getSym()) - return X86Operand::CreateMem(SM.getSym(), Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, + Size); const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext()); return X86Operand::CreateImm(ImmExpr, Start, End); @@ -1802,8 +1800,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, if (getLexer().isNot(AsmToken::LParen)) { // Unless we have a segment register, treat this as an immediate. if (SegReg == 0) - return X86Operand::CreateMem(Disp, MemStart, ExprEnd); - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); } // Eat the '('. @@ -1829,8 +1828,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, if (getLexer().isNot(AsmToken::LParen)) { // Unless we have a segment register, treat this as an immediate. if (SegReg == 0) - return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd); - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc, + ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); } // Eat the '('. 
@@ -1946,9 +1947,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } if (SegReg || BaseReg || IndexReg) - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, - MemStart, MemEnd); - return X86Operand::CreateMem(Disp, MemStart, MemEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, MemStart, MemEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd); } bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -1963,14 +1964,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName = PatchedName.substr(0, Name.size()-1); // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. - const MCExpr *ExtraImmOp = nullptr; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; - unsigned SSECCIdx = IsVCMP ? 4 : 3; - unsigned SSEComparisonCode = StringSwitch<unsigned>( - PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) + unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(CCIdx, PatchedName.size() - 2)) .Case("eq", 0x00) .Case("lt", 0x01) .Case("le", 0x02) @@ -2005,27 +2005,75 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) { - ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, - getParser().getContext()); - if (PatchedName.endswith("ss")) { - PatchedName = IsVCMP ? "vcmpss" : "cmpss"; - } else if (PatchedName.endswith("sd")) { - PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; - } else if (PatchedName.endswith("ps")) { - PatchedName = IsVCMP ? "vcmpps" : "cmpps"; - } else { - assert(PatchedName.endswith("pd") && "Unexpected mnemonic!"); - PatchedName = IsVCMP ? "vcmppd" : "cmppd"; - } + if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { + + Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), + NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - 2); + } + } + + // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcmp") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("eq", 0x0) // Only allowed on unsigned. Checked below. + .Case("lt", 0x1) + .Case("le", 0x2) + //.Case("false", 0x3) // Not a documented alias. + .Case("neq", 0x4) + .Case("nlt", 0x5) + .Case("nle", 0x6) + //.Case("true", 0x7) // Not a documented alias. 
+ .Default(~0U); + if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { + Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcom") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("lt", 0x0) + .Case("le", 0x1) + .Case("gt", 0x2) + .Case("ge", 0x3) + .Case("eq", 0x4) + .Case("neq", 0x5) + .Case("false", 0x6) + .Case("true", 0x7) + .Default(~0U); + if (ComparisonCode != ~0U) { + Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); } } Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); - if (ExtraImmOp && !isParsingIntelSyntax()) - Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); - // Determine whether this is an instruction prefix. bool isPrefix = Name == "lock" || Name == "rep" || @@ -2071,9 +2119,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); - if (ExtraImmOp && isParsingIntelSyntax()) - Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); - // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. @@ -2272,6 +2317,22 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); } +bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { + switch (Inst.getOpcode()) { + default: return true; + case X86::INT: + X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); + assert(Op.isImm() && "expected immediate"); + int64_t Res; + if (!Op.getImm()->EvaluateAsAbsolute(Res) || Res > 255) { + Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); + return false; + } + return true; + } + llvm_unreachable("handle the instruction appropriately"); +} + bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; @@ -2432,8 +2493,11 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())) { - default: break; + default: llvm_unreachable("Unexpected match result!"); case Match_Success: + if (!validateInstruction(Inst, Operands)) + return true; + // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. 
@@ -2614,7 +2678,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; for (const char *Instr : PtrSizedInstrs) { if (Mnemonic == Instr) { - UnsizedMemOp->Mem.Size = getPointerSize(); + UnsizedMemOp->Mem.Size = getPointerWidth(); break; } } @@ -2626,7 +2690,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVector<unsigned, 8> Match; uint64_t ErrorInfoMissingFeature = 0; if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { - static const unsigned MopSizes[] = {8, 16, 32, 64, 80}; + static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512}; for (unsigned Size : MopSizes) { UnsizedMemOp->Mem.Size = Size; uint64_t ErrorInfoIgnore; @@ -2648,7 +2712,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } // If we haven't matched anything yet, this is not a basic integer or FPU - // operation. There shouldn't be any ambiguity in our mneumonic table, so try + // operation. There shouldn't be any ambiguity in our mnemonic table, so try // matching with the unsized operand. if (Match.empty()) { Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, @@ -2677,6 +2741,9 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { + if (!validateInstruction(Inst, Operands)) + return true; + // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 72aeeaa..7610806 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -34,6 +34,11 @@ inline bool isImmSExti64i32Value(uint64_t Value) { (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } +inline bool isImmUnsignedi8Value(uint64_t Value) { + return (( Value <= 0x00000000000000FFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + } // End of namespace llvm #endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index e0fab8d..d67e119 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -53,6 +53,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; + unsigned ModeSize; }; union { @@ -120,6 +121,10 @@ struct X86Operand : public MCParsedAsmOperand { assert(Kind == Memory && "Invalid access!"); return Mem.Scale; } + unsigned getMemModeSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.ModeSize; + } bool isToken() const override {return Kind == Token; } @@ -182,6 +187,13 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + bool isImmUnsignedi8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi8Value(CE->getValue()); + } + bool isOffsetOf() const override { return OffsetOfLoc.getPointer(); } @@ -249,6 +261,10 @@ struct X86Operand : public MCParsedAsmOperand { !getMemIndexReg() && getMemScale() == 1; } + bool isAbsMem16() const { + return isAbsMem() && Mem.ModeSize == 16; + } + 
bool isSrcIdx() const { return !getMemIndexReg() && getMemScale() == 1 && (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || @@ -288,21 +304,43 @@ struct X86Operand : public MCParsedAsmOperand { return isMem64() && isDstIdx(); } - bool isMemOffs8() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8); + bool isMemOffs() const { + return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() && + getMemScale() == 1; + } + + bool isMemOffs16_8() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs16_16() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16); } - bool isMemOffs16() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16); + bool isMemOffs16_32() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32); } - bool isMemOffs32() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32); + bool isMemOffs32_8() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8); } - bool isMemOffs64() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64); + bool isMemOffs32_16() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs32_32() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_64() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64); + } + bool isMemOffs64_8() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs64_16() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs64_32() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs64_64() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64); } bool isReg() const override { return Kind == Register; } @@ -430,8 +468,9 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static std::unique_ptr<X86Operand> - CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, - StringRef SymName = StringRef(), void *OpDecl = nullptr) { + CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -439,6 +478,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; @@ -447,9 +487,9 @@ struct X86Operand : public MCParsedAsmOperand { /// Create a generalized memory operand. 
static std::unique_ptr<X86Operand> - CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, - unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, StringRef SymName = StringRef(), + CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, + SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. @@ -465,6 +505,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 1083fad..be61b47 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp + X86CallFrameOptimization.cpp X86FastISel.cpp X86FloatingPoint.cpp X86FrameLowering.cpp @@ -38,7 +39,7 @@ if( CMAKE_CL_64 ) ADD_CUSTOM_COMMAND( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj MAIN_DEPENDENCY X86CompilationCallback_Win64.asm - COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm + COMMAND ${CMAKE_ASM_MASM_COMPILER} /nologo /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm ) set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj) endif() diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5e8c2d6..99fb1ab 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -51,8 +51,8 @@ const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, #define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); -namespace llvm { - +namespace llvm { + // Fill-ins to make the compiler happy. These constants are never actually // assigned; they are just filler to make an automatically-generated switch // statement work. @@ -127,11 +127,11 @@ static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { static void logger(void* arg, const char* log) { if (!arg) return; - + raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); vStream << log << "\n"; -} - +} + // // Public interface for the disassembler // @@ -184,7 +184,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) { } /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the -/// immediate Value in the MCInst. +/// immediate Value in the MCInst. /// /// @param Value - The immediate Value, has had any PC adjustment made by /// the caller. @@ -196,7 +196,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) { /// If the getOpInfo() function was set when setupForSymbolicDisassembly() was /// called then that function is called to get any symbolic information for the /// immediate in the instruction using the Address, Offset and Width. If that -/// returns non-zero then the symbolic information it returns is used to create +/// returns non-zero then the symbolic information it returns is used to create /// an MCExpr and that is added as an operand to the MCInst. 
If getOpInfo() /// returns zero and isBranch is true then a symbol look up for immediate Value /// is done and if a symbol is found an MCExpr is created with that, else @@ -204,8 +204,8 @@ static void translateRegister(MCInst &mcInst, Reg reg) { /// if it adds an operand to the MCInst and false otherwise. static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Dis) { + uint64_t Width, MCInst &MI, + const MCDisassembler *Dis) { return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, Width); } @@ -215,7 +215,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, /// These can often be addresses in a literal pool. The Address of the /// instruction and its immediate Value are used to determine the address /// being referenced in the literal pool entry. The SymbolLookUp call back will -/// return a pointer to a literal 'C' string if the referenced address is an +/// return a pointer to a literal 'C' string if the referenced address is an /// address into a section with 'C' string literals. static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, const void *Decoder) { @@ -287,7 +287,7 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { static void translateImmediate(MCInst &mcInst, uint64_t immediate, const OperandSpecifier &operand, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { // Sign-extend the immediate if necessary. OperandType type = (OperandType)operand.type; @@ -320,24 +320,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, // By default sign-extend all X86 immediates based on their encoding. else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || type == TYPE_IMM64 || type == TYPE_IMMv) { - uint32_t Opcode = mcInst.getOpcode(); switch (operand.encoding) { default: break; case ENCODING_IB: - // Special case those X86 instructions that use the imm8 as a set of - // bits, bit count, etc. and are not sign-extend. - if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri && - Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && - Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && - Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && - Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && - Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && - Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && - Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && - Opcode != X86::VINSERTPSrr) - if(immediate & 0x80) - immediate |= ~(0xffull); + if(immediate & 0x80) + immediate |= ~(0xffull); break; case ENCODING_IW: if(immediate & 0x8000) @@ -350,6 +338,199 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case ENCODING_IO: break; } + } else if (type == TYPE_IMM3) { + // Check for immediates that printSSECC can't handle. 
+ if (immediate >= 8) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; + case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; + case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; + case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; + case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; + case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; + case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; + case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; + case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; + case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; + case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; + case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; + case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; + case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; + case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; + case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; + case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; + case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; + case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; + case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; + case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; + case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; + case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_IMM5) { + // Check for immediates that printAVXCC can't handle. + if (immediate >= 32) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + } + // Switch opcode to the one that doesn't get special printing. 
+ mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_AVX512ICC) { + if (immediate >= 8 || ((immediate & 0x3) == 3)) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; + case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; + case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; 
break; + case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; + case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; + case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; + case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; + case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; + case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; + case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; + case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; + case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; + case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; + case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; + case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; + case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; + case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; + case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; + case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; + case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; + case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; + case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; + case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; + case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; + case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; + case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; + case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; + case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; + case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; + case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; + case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; + case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; + case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; + case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; + case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; + case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; + case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; + case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; + case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; + case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; + case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; + case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; + case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; + case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; + case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; + case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; + case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; + case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; + case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; + case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break; + case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; + case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; + case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; + case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; + case X86::VPCMPUWZ256rmik: NewOpc = 
X86::VPCMPUWZ256rmik_alt; break; + case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; + case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; + case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; + case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; + case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; + case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } } switch (type) { @@ -407,7 +588,7 @@ static bool translateRMRegister(MCInst &mcInst, debug("A R/M register operand may not have a SIB byte"); return true; } - + switch (insn.eaBase) { default: debug("Unexpected EA base register"); @@ -427,7 +608,7 @@ static bool translateRMRegister(MCInst &mcInst, ALL_REGS #undef ENTRY } - + return false; } @@ -440,26 +621,26 @@ static bool translateRMRegister(MCInst &mcInst, /// from. /// @return - 0 on success; nonzero otherwise static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { // Addresses in an MCInst are represented as five operands: - // 1. basereg (register) The R/M base, or (if there is a SIB) the + // 1. basereg (register) The R/M base, or (if there is a SIB) the // SIB base - // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified // scale amount // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) - // the index (which is multiplied by the + // the index (which is multiplied by the // scale amount) // 4. displacement (immediate) 0, or the displacement if there is one // 5. segmentreg (register) x86_registerNONE for now, but could be set // if we have segment overrides - + MCOperand baseReg; MCOperand scaleAmount; MCOperand indexReg; MCOperand displacement; MCOperand segmentReg; uint64_t pcrel = 0; - + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { if (insn.sibBase != SIB_BASE_NONE) { switch (insn.sibBase) { @@ -512,7 +693,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 : IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; - insn.sibIndex = (SIBIndex)(IndexBase + + insn.sibIndex = (SIBIndex)(IndexBase + (insn.sibIndex == SIB_INDEX_NONE ? 
4 : IndexOffset)); } @@ -534,7 +715,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else { indexReg = MCOperand::CreateReg(0); } - + scaleAmount = MCOperand::CreateImm(insn.sibScale); } else { switch (insn.eaBase) { @@ -553,7 +734,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else baseReg = MCOperand::CreateReg(0); - + indexReg = MCOperand::CreateReg(0); break; case EA_BASE_BX_SI: @@ -584,7 +765,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, // placeholders to keep the compiler happy. #define ENTRY(x) \ case EA_BASE_##x: \ - baseReg = MCOperand::CreateReg(X86::x); break; + baseReg = MCOperand::CreateReg(X86::x); break; ALL_EA_BASES #undef ENTRY #define ENTRY(x) case EA_REG_##x: @@ -595,14 +776,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; } } - + scaleAmount = MCOperand::CreateImm(1); } - + displacement = MCOperand::CreateImm(insn.displacement); segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); - + mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); @@ -623,7 +804,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, /// from. /// @return - 0 on success; nonzero otherwise static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, - InternalInstruction &insn, const MCDisassembler *Dis) { + InternalInstruction &insn, const MCDisassembler *Dis) { switch (operand.type) { default: debug("Unexpected type for a R/M operand"); @@ -633,8 +814,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_R32: case TYPE_R64: case TYPE_Rv: - case TYPE_MM: - case TYPE_MM32: case TYPE_MM64: case TYPE_XMM: case TYPE_XMM32: @@ -660,9 +839,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_M32FP: case TYPE_M64FP: case TYPE_M80FP: - case TYPE_M16INT: - case TYPE_M32INT: - case TYPE_M64INT: case TYPE_M1616: case TYPE_M1632: case TYPE_M1664: @@ -670,7 +846,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, return translateRMMemory(mcInst, insn, Dis); } } - + /// translateFPRegister - Translates a stack position on the FPU stack to its /// LLVM form, and appends it to an MCInst. /// @@ -698,7 +874,7 @@ static bool translateMaskRegister(MCInst &mcInst, return false; } -/// translateOperand - Translates an operand stored in an internal instruction +/// translateOperand - Translates an operand stored in an internal instruction /// to LLVM's format and appends it to an MCInst. /// /// @param mcInst - The MCInst to append to. @@ -707,7 +883,7 @@ static bool translateMaskRegister(MCInst &mcInst, /// @return - false on success; true otherwise. static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { switch (operand.encoding) { default: debug("Unhandled operand encoding during translation"); @@ -761,7 +937,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, insn, Dis); } } - + /// translateInstruction - Translates an internal instruction and all its /// operands to an MCInst. /// @@ -770,12 +946,12 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, /// @return - false on success; true otherwise. 
static bool translateInstruction(MCInst &mcInst, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { if (!insn.spec) { debug("Instruction has no specification"); return true; } - + mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the @@ -786,9 +962,9 @@ static bool translateInstruction(MCInst &mcInst, else if(mcInst.getOpcode() == X86::REPNE_PREFIX) mcInst.setOpcode(X86::XACQUIRE_PREFIX); } - + insn.numImmediatesTranslated = 0; - + for (const auto &Op : insn.operands) { if (Op.encoding != ENCODING_NONE) { if (translateOperand(mcInst, Op, insn, Dis)) { @@ -796,7 +972,7 @@ static bool translateInstruction(MCInst &mcInst, } } } - + return false; } @@ -807,9 +983,9 @@ static MCDisassembler *createX86Disassembler(const Target &T, return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII)); } -extern "C" void LLVMInitializeX86Disassembler() { +extern "C" void LLVMInitializeX86Disassembler() { // Register the disassembler. - TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + TargetRegistry::RegisterMCDisassembler(TheX86_32Target, createX86Disassembler); TargetRegistry::RegisterMCDisassembler(TheX86_64Target, createX86Disassembler); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 98b3440..619a0d4 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -975,27 +975,16 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (insn->rexPrefix & 0x08) attrMask |= ATTR_REXW; - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - /* * JCXZ/JECXZ need special handling for 16-bit mode because the meaning * of the AdSize prefix is inverted w.r.t. 32-bit mode. */ - if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) { - const struct InstructionSpecifier *spec; - spec = specifierForUID(instructionID); + if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE && + insn->opcode == 0xE3) + attrMask ^= ATTR_ADSIZE; - /* - * Check for Ii8PCRel instructions. We could alternatively do a - * string-compare on the names, but this is probably cheaper. - */ - if (x86OperandSets[spec->operands][0].type == TYPE_REL8) { - attrMask ^= ATTR_ADSIZE; - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - } - } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; /* The following clauses compensate for limitations of the tables. */ @@ -1030,6 +1019,32 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { } } + /* + * Absolute moves need special handling. + * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are + * inverted w.r.t. + * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in + * any position. + */ + if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) { + /* Make sure we observed the prefixes in any position. */ + if (insn->prefixPresent[0x67]) + attrMask |= ATTR_ADSIZE; + if (insn->prefixPresent[0x66]) + attrMask |= ATTR_OPSIZE; + + /* In 16-bit, invert the attributes. 
*/ + if (insn->mode == MODE_16BIT) + attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE; + + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + insn->instructionID = instructionID; + insn->spec = specifierForUID(instructionID); + return 0; + } + if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && !(attrMask & ATTR_OPSIZE)) { /* @@ -1445,22 +1460,14 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_VK16: \ return prefix##_K0 + index; \ case TYPE_MM64: \ - case TYPE_MM32: \ - case TYPE_MM: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_MM0 + index; \ + return prefix##_MM0 + (index & 0x7); \ case TYPE_SEGMENTREG: \ if (index > 5) \ *valid = 0; \ return prefix##_ES + index; \ case TYPE_DEBUGREG: \ - if (index > 7) \ - *valid = 0; \ return prefix##_DR0 + index; \ case TYPE_CONTROLREG: \ - if (index > 8) \ - *valid = 0; \ return prefix##_CR0 + index; \ } \ } @@ -1737,12 +1744,6 @@ static int readOperands(struct InternalInstruction* insn) { } if (readImmediate(insn, 1)) return -1; - if (Op.type == TYPE_IMM3 && - insn->immediates[insn->numImmediatesConsumed - 1] > 7) - return -1; - if (Op.type == TYPE_IMM5 && - insn->immediates[insn->numImmediatesConsumed - 1] > 31) - return -1; if (Op.type == TYPE_XMM128 || Op.type == TYPE_XMM256) sawRegImm = 1; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 457b382..a79a923 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -341,7 +341,15 @@ namespace X86Disassembler { ENTRY(DR4) \ ENTRY(DR5) \ ENTRY(DR6) \ - ENTRY(DR7) + ENTRY(DR7) \ + ENTRY(DR8) \ + ENTRY(DR9) \ + ENTRY(DR10) \ + ENTRY(DR11) \ + ENTRY(DR12) \ + ENTRY(DR13) \ + ENTRY(DR14) \ + ENTRY(DR15) #define REGS_CONTROL \ ENTRY(CR0) \ @@ -352,7 +360,14 @@ namespace X86Disassembler { ENTRY(CR5) \ ENTRY(CR6) \ ENTRY(CR7) \ - ENTRY(CR8) + ENTRY(CR8) \ + ENTRY(CR9) \ + ENTRY(CR10) \ + ENTRY(CR11) \ + ENTRY(CR12) \ + ENTRY(CR13) \ + ENTRY(CR14) \ + ENTRY(CR15) #define ALL_EA_BASES \ EA_BASES_16BIT \ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index bec4f0e..70c6042 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -82,6 +82,7 @@ enum attributeBits { "operands change width") \ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ "operands change width") \ + ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ "but not the operands") \ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ @@ -90,20 +91,24 @@ enum attributeBits { "operands change width") \ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ "operands change width") \ - ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\ + ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\ "change width; overrides IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \ + "prefix") \ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ - ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \ + ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \ + "IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 6, "XD 
instructions are SSE; REX.W is " \ "secondary") \ - ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ - ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \ "opcode") \ - ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \ "IC_64BIT_REXW_XS") \ - ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! Prefer over all " \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! Prefer over all " \ "else because this changes most " \ "operands' meaning") \ ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ @@ -401,6 +406,8 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_IMM64, "8-byte") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ + ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ + ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ ENUM_ENTRY(TYPE_RM16, "2-byte") \ ENUM_ENTRY(TYPE_RM32, "4-byte") \ @@ -416,10 +423,6 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ - ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \ - ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \ - ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \ - ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \ ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \ ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \ ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \ @@ -438,14 +441,8 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ ENUM_ENTRY(TYPE_M64FP, "64-bit") \ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ - ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \ - "floating-point instructions") \ - ENUM_ENTRY(TYPE_M32INT, "4-byte") \ - ENUM_ENTRY(TYPE_M64INT, "8-byte") \ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ - ENUM_ENTRY(TYPE_MM, "MMX register operand") \ - ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \ - ENUM_ENTRY(TYPE_MM64, "8-byte") \ + ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \ ENUM_ENTRY(TYPE_XMM, "XMM register operand") \ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ ENUM_ENTRY(TYPE_XMM64, "8-byte") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index b72730c..65461af 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -72,35 +72,11 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } -void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0xf; - switch (Imm) { - default: llvm_unreachable("Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; 
break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - } -} - -void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; +void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); switch (Imm) { - default: llvm_unreachable("Invalid avxcc argument!"); + default: llvm_unreachable("Invalid ssecc/avxcc argument!"); case 0: O << "eq"; break; case 1: O << "lt"; break; case 2: O << "le"; break; @@ -136,8 +112,24 @@ void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, } } -void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, +void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + +void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; switch (Imm) { case 0: O << "{rn-sae}"; break; @@ -163,8 +155,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { O << formatHex((uint64_t)Address); - } - else { + } else { // Otherwise, just print the expression. 
O << *Op.getExpr(); } @@ -295,3 +286,10 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, O << markup(">"); } + +void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<imm:") + << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) + << markup(">"); +} diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 41be14b..f71cb81 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -45,18 +45,23 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -137,7 +142,7 @@ public: private: bool HasCustomInstComment; }; - + } #endif diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index a8f15e6..10a1482 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -1,724 +1,982 @@ -//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This defines functionality used to emit comments about X86 instructions to -// an output stream for -fverbose-asm. -// -//===----------------------------------------------------------------------===// - -#include "X86InstComments.h" -#include "MCTargetDesc/X86MCTargetDesc.h" -#include "Utils/X86ShuffleDecode.h" -#include "llvm/MC/MCInst.h" -#include "llvm/CodeGen/MachineValueType.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Top Level Entrypoint -//===----------------------------------------------------------------------===// - -/// EmitAnyX86InstComments - This function decodes x86 instructions and prints -/// newline terminated strings to the specified string if desired. This -/// information is shown in disassembly dumps when verbose assembly is enabled. -bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, - const char *(*getRegName)(unsigned)) { - // If this is a shuffle operation, the switch should fill in this state. 
- SmallVector<int, 8> ShuffleMask; - const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; - - switch (MI->getOpcode()) { - default: - // Not an instruction for which we can decode comments. - return false; - - case X86::BLENDPDrri: - case X86::VBLENDPDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::BLENDPDrmi: - case X86::VBLENDPDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VBLENDPDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::BLENDPSrri: - case X86::VBLENDPSrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::BLENDPSrmi: - case X86::VBLENDPSrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VBLENDPSYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::PBLENDWrri: - case X86::VPBLENDWrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PBLENDWrmi: - case X86::VPBLENDWrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDWYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDWYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::VPBLENDDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::VPBLENDDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::VPBLENDDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeBLENDMask(MVT::v8i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::INSERTPSrr: - case X86::VINSERTPSrr: - DestName = getRegName(MI->getOperand(0).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - Src2Name = getRegName(MI->getOperand(2).getReg()); - if(MI->getOperand(3).isImm()) - DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); - break; - - case X86::MOVLHPSrr: - case X86::VMOVLHPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVLHPSMask(2, ShuffleMask); - break; - - case X86::MOVHLPSrr: - case X86::VMOVHLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVHLPSMask(2, ShuffleMask); - break; - - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSHDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSLDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::PSLLDQri: - case X86::VPSLLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::VPSLLDQYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSLLDQMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::PSRLDQri: - case X86::VPSRLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::VPSRLDQYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSRLDQMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::PALIGNR128rr: - case X86::VPALIGNR128rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::PALIGNR128rm: - case X86::VPALIGNR128rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePALIGNRMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - case X86::VPALIGNR256rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPALIGNR256rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePALIGNRMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::PSHUFDri: - case X86::VPSHUFDri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::PSHUFDmi: - case X86::VPSHUFDmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPSHUFDYmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v8i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - - case X86::PSHUFHWri: - case X86::VPSHUFHWri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::PSHUFHWmi: - case X86::VPSHUFHWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFHWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPSHUFHWYmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFHWMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - case X86::PSHUFLWri: - case X86::VPSHUFLWri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::PSHUFLWmi: - case X86::VPSHUFLWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFLWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPSHUFLWYmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFLWMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - break; - - case X86::PUNPCKHBWrr: - case X86::VPUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - case X86::VPUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKHBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::VPUNPCKHBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKHWDrr: - case X86::VPUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - case X86::VPUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKHWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKHDQrr: - case X86::VPUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - case X86::VPUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKHDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); - break; - case X86::PUNPCKHQDQrr: - case X86::VPUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - case X86::VPUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); - break; - - case X86::PUNPCKLBWrr: - case X86::VPUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLBWrm: - case X86::VPUNPCKLBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKLBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKLWDrr: - case X86::VPUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLWDrm: - case X86::VPUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKLWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKLDQrr: - case X86::VPUNPCKLDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::PUNPCKLDQrm: - case X86::VPUNPCKLDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKLDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); - break; - case X86::PUNPCKLQDQrr: - case X86::VPUNPCKLQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLQDQrm: - case X86::VPUNPCKLQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKLQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); - break; - - case X86::SHUFPDrri: - case X86::VSHUFPDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::SHUFPDrmi: - case X86::VSHUFPDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::SHUFPSrri: - case X86::VSHUFPSrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::SHUFPSrmi: - case X86::VSHUFPSrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeSHUFPMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPSYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeSHUFPMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - - case X86::UNPCKLPDrr: - case X86::VUNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - case X86::VUNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDYrm: - DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKLPSrr: - case X86::VUNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::UNPCKLPSrm: - case X86::VUNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSYrm: - DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPDrr: - case X86::VUNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - case X86::VUNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKHPDYrm: - DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPSrr: - case X86::VUNPCKHPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPSrm: - case X86::VUNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKHPSYrm: - DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPDmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPDYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodePSHUFMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERM2F128rr: - case X86::VPERM2I128rr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPERM2F128rm: - case X86::VPERM2I128rm: - // For instruction comments purpose, assume the 256-bit vector is v4i64. 
- if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeVPERM2X128Mask(MVT::v4i64, - MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMQYri: - case X86::VPERMPDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMQYmi: - case X86::VPERMPDYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - } - - // The only comments we decode are shuffles, so give up if we were unable to - // decode a shuffle mask. - if (ShuffleMask.empty()) - return false; - - if (!DestName) DestName = Src1Name; - OS << (DestName ? DestName : "mem") << " = "; - - // If the two sources are the same, canonicalize the input elements to be - // from the first src so that we get larger element spans. - if (Src1Name == Src2Name) { - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= (int)e) // From second mask. - ShuffleMask[i] -= e; - } - } - - // The shuffle mask specifies which elements of the src1/src2 fill in the - // destination, with a few sentinel values. Loop through and print them - // out. - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if (i != 0) - OS << ','; - if (ShuffleMask[i] == SM_SentinelZero) { - OS << "zero"; - continue; - } - - // Otherwise, it must come from src1 or src2. Print the span of elements - // that comes from this src. - bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); - const char *SrcName = isSrc1 ? Src1Name : Src2Name; - OS << (SrcName ? SrcName : "mem") << '['; - bool IsFirst = true; - while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && - (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { - if (!IsFirst) - OS << ','; - else - IsFirst = false; - if (ShuffleMask[i] == SM_SentinelUndef) - OS << "u"; - else - OS << ShuffleMask[i] % ShuffleMask.size(); - ++i; - } - OS << ']'; - --i; // For loop increments element #. - } - //MI->print(OS, 0); - OS << "\n"; - - // We successfully added a comment to this instruction. - return true; -} +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// \brief Extracts the src/dst types for a given zero extension instruction.
+/// \note While the number of elements in the DstVT type is correct, the
+/// number of elements in the SrcVT type is expanded to fill the src xmm
+/// register, and the upper src elements may not be included in the dst
+/// xmm/ymm register.
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown zero extension instruction");
+ // i8 zero extension
+ case X86::PMOVZXBWrm:
+ case X86::PMOVZXBWrr:
+ case X86::VPMOVZXBWrm:
+ case X86::VPMOVZXBWrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v8i16;
+ break;
+ case X86::VPMOVZXBWYrm:
+ case X86::VPMOVZXBWYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v16i16;
+ break;
+ case X86::PMOVZXBDrm:
+ case X86::PMOVZXBDrr:
+ case X86::VPMOVZXBDrm:
+ case X86::VPMOVZXBDrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v4i32;
+ break;
+ case X86::VPMOVZXBDYrm:
+ case X86::VPMOVZXBDYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v8i32;
+ break;
+ case X86::PMOVZXBQrm:
+ case X86::PMOVZXBQrr:
+ case X86::VPMOVZXBQrm:
+ case X86::VPMOVZXBQrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXBQYrm:
+ case X86::VPMOVZXBQYrr:
+ SrcVT = MVT::v16i8;
+ DstVT = MVT::v4i64;
+ break;
+ // i16 zero extension
+ case X86::PMOVZXWDrm:
+ case X86::PMOVZXWDrr:
+ case X86::VPMOVZXWDrm:
+ case X86::VPMOVZXWDrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v4i32;
+ break;
+ case X86::VPMOVZXWDYrm:
+ case X86::VPMOVZXWDYrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v8i32;
+ break;
+ case X86::PMOVZXWQrm:
+ case X86::PMOVZXWQrr:
+ case X86::VPMOVZXWQrm:
+ case X86::VPMOVZXWQrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXWQYrm:
+ case X86::VPMOVZXWQYrr:
+ SrcVT = MVT::v8i16;
+ DstVT = MVT::v4i64;
+ break;
+ // i32 zero extension
+ case X86::PMOVZXDQrm:
+ case X86::PMOVZXDQrr:
+ case X86::VPMOVZXDQrm:
+ case X86::VPMOVZXDQrr:
+ SrcVT = MVT::v4i32;
+ DstVT = MVT::v2i64;
+ break;
+ case X86::VPMOVZXDQYrm:
+ case X86::VPMOVZXDQYrr:
+ SrcVT = MVT::v4i32;
+ DstVT = MVT::v4i64;
+ break;
+ }
+}
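For reference, a minimal sketch of how a caller might consume the types reported
above (hypothetical usage, not part of this patch; the element counts follow from
the \note on getZeroExtensionTypes):

  // Illustration only: for X86::PMOVZXBWrr the switch above reports
  // SrcVT = v16i8 and DstVT = v8i16, i.e. SrcVT spans the whole xmm register
  // even though only its low 8 bytes are zero-extended into the destination.
  MVT SrcVT, DstVT;
  getZeroExtensionTypes(MI, SrcVT, DstVT);
  unsigned NumDstElts = DstVT.getVectorNumElements(); // 8 for PMOVZXBWrr
  unsigned NumSrcElts = SrcVT.getVectorNumElements(); // 16; only the low
                                                      // NumDstElts are used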
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline-terminated strings to the specified output stream if desired. This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+ const char *(*getRegName)(unsigned)) {
+ // If this is a shuffle operation, the switch should fill in this state.
+ SmallVector<int, 8> ShuffleMask;
+ const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+
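As a rough illustration of how this state gets filled and used (a hedged sketch,
not code from this patch), decoding the immediate of a 128-bit BLENDPD produces
source-relative indices in ShuffleMask, which this function later turns into a
verbose-asm comment:

  // Illustration only: DecodeBLENDMask is declared in Utils/X86ShuffleDecode.h.
  // For BLENDPDrri with immediate 2 (binary 10), lane 0 comes from the first
  // source and lane 1 from the second source.
  SmallVector<int, 8> Mask;
  DecodeBLENDMask(MVT::v2f64, /*Imm=*/2, Mask); // Mask == {0, 3}
  // With xmm1/xmm2 as sources and xmm0 as the destination, the emitted
  // comment reads roughly:  xmm0 = xmm1[0],xmm2[1]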
+ switch (MI->getOpcode()) {
+ default:
+ // Not an instruction for which we can decode comments.
+ return false;
+
+ case X86::BLENDPDrri:
+ case X86::VBLENDPDrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::BLENDPDrmi:
+ case X86::VBLENDPDrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v2f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VBLENDPDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VBLENDPDYrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v4f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::BLENDPSrri:
+ case X86::VBLENDPSrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::BLENDPSrmi:
+ case X86::VBLENDPSrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v4f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VBLENDPSYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VBLENDPSYrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v8f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::PBLENDWrri:
+ case X86::VPBLENDWrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PBLENDWrmi:
+ case X86::VPBLENDWrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPBLENDWYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPBLENDWYrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPBLENDDrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPBLENDDrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v4i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::VPBLENDDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPBLENDDYrmi:
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeBLENDMask(MVT::v8i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::INSERTPSrm:
+ case X86::VINSERTPSrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::MOVLHPSrr:
+ case X86::VMOVLHPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVLHPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVHLPSrr:
+ case X86::VMOVHLPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVHLPSMask(2, ShuffleMask);
+ break;
+
+ case X86::MOVSLDUPrr:
+ case X86::VMOVSLDUPrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSLDUPrm:
+ case X86::VMOVSLDUPrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
+ break;
+
+ case X86::VMOVSHDUPYrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VMOVSHDUPYrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
+ break;
+
+ case X86::VMOVSLDUPYrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VMOVSLDUPYrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
+ break;
+
+ case X86::MOVSHDUPrr:
+ case X86::VMOVSHDUPrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSHDUPrm:
+ case X86::VMOVSHDUPrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
+ break;
+
+ case X86::VMOVDDUPYrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VMOVDDUPYrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
+ break;
+
+ case X86::MOVDDUPrr:
+ case X86::VMOVDDUPrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVDDUPrm:
+ case X86::VMOVDDUPrm:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+ break;
+
+ case X86::PSLLDQri:
+ case X86::VPSLLDQri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSLLDQMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::VPSLLDQYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSLLDQMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSRLDQri:
+ case X86::VPSRLDQri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSRLDQMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::VPSRLDQYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSRLDQMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PALIGNR128rr:
+ case X86::VPALIGNR128rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PALIGNR128rm:
+ case X86::VPALIGNR128rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePALIGNRMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPALIGNR256rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPALIGNR256rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePALIGNRMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PSHUFDri:
+ case X86::VPSHUFDri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFDmi:
+ case X86::VPSHUFDmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFDYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v8i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+
+ case X86::PSHUFHWri:
+ case X86::VPSHUFHWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFHWmi:
+ case X86::VPSHUFHWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFHWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFHWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFHWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFHWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::PSHUFLWri:
+ case X86::VPSHUFLWri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PSHUFLWmi:
+ case X86::VPSHUFLWmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFLWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPSHUFLWYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPSHUFLWYmi:
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFLWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
+ case X86::PUNPCKHBWrr:
+ case X86::VPUNPCKHBWrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHBWrm:
+ case X86::VPUNPCKHBWrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
+ break;
+ case X86::VPUNPCKHBWYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHBWYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
+ break;
+ case X86::PUNPCKHWDrr:
+ case X86::VPUNPCKHWDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHWDrm:
+ case X86::VPUNPCKHWDrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
+ break;
+ case X86::VPUNPCKHWDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHWDYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
+ break;
+ case X86::PUNPCKHDQrr:
+ case X86::VPUNPCKHDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHDQrm:
+ case X86::VPUNPCKHDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKHDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKHDQZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHDQZrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v16i32, ShuffleMask);
+ break;
+ case X86::PUNPCKHQDQrr:
+ case X86::VPUNPCKHQDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKHQDQrm:
+ case X86::VPUNPCKHQDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKHQDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHQDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKHQDQZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKHQDQZrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKHMask(MVT::v8i64, ShuffleMask);
+ break;
+
+ case X86::PUNPCKLBWrr:
+ case X86::VPUNPCKLBWrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLBWrm:
+ case X86::VPUNPCKLBWrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
+ break;
+ case X86::VPUNPCKLBWYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLBWYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
+ break;
+ case X86::PUNPCKLWDrr:
+ case X86::VPUNPCKLWDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLWDrm:
+ case X86::VPUNPCKLWDrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+ break;
+ case X86::VPUNPCKLWDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLWDYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+ break;
+ case X86::PUNPCKLDQrr:
+ case X86::VPUNPCKLDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLDQrm:
+ case X86::VPUNPCKLDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKLDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+ break;
+ case X86::VPUNPCKLDQZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLDQZrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v16i32, ShuffleMask);
+ break;
+ case X86::PUNPCKLQDQrr:
+ case X86::VPUNPCKLQDQrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PUNPCKLQDQrm:
+ case X86::VPUNPCKLQDQrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKLQDQYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLQDQYrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+ break;
+ case X86::VPUNPCKLQDQZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPUNPCKLQDQZrm:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(MVT::v8i64, ShuffleMask);
+ break;
+
+ case X86::SHUFPDrri:
+ case X86::VSHUFPDrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::SHUFPDrmi:
+ case X86::VSHUFPDrmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v2f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VSHUFPDYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VSHUFPDYrmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v4f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::SHUFPSrri:
+ case X86::VSHUFPSrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::SHUFPSrmi:
+ case X86::VSHUFPSrmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v4f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VSHUFPSYrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VSHUFPSYrmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v8f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::UNPCKLPDrr:
+ case X86::VUNPCKLPDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKLPDrm:
+ case X86::VUNPCKLPDrm:
+ DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPDYrm:
+ DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPDZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPDZrm:
+ DecodeUNPCKLMask(MVT::v8f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKLPSrr:
+ case X86::VUNPCKLPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKLPSrm:
+ case X86::VUNPCKLPSrm:
+ DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPSYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPSYrm:
+ DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKLPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKLPSZrm:
+ DecodeUNPCKLMask(MVT::v16f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKHPDrr:
+ case X86::VUNPCKHPDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKHPDrm:
+ case X86::VUNPCKHPDrm:
+ DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPDYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPDYrm:
+ DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPDZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPDZrm:
+ DecodeUNPCKHMask(MVT::v8f64, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::UNPCKHPSrr:
+ case X86::VUNPCKHPSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::UNPCKHPSrm:
+ case X86::VUNPCKHPSrm:
+ DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPSYrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPSYrm:
+ DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VUNPCKHPSZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VUNPCKHPSZrm:
+ DecodeUNPCKHMask(MVT::v16f32, ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPSri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPSmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPSYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPSYmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v8f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPDmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v2f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMILPDYmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPERM2F128rm:
+ case X86::VPERM2I128rm:
+ // For the purpose of instruction comments, assume the 256-bit vector is v4i64.
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeVPERM2X128Mask(MVT::v4i64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMQYri:
+ case X86::VPERMPDYri:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::VPERMQYmi:
+ case X86::VPERMPDYmi:
+ if (MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVSDrr:
+ case X86::VMOVSDrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSDrm:
+ case X86::VMOVSDrm:
+ DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::MOVSSrr:
+ case X86::VMOVSSrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVSSrm:
+ case X86::VMOVSSrm:
+ DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::MOVPQI2QIrr:
+ case X86::MOVZPQILo2PQIrr:
+ case X86::VMOVPQI2QIrr:
+ case X86::VMOVZPQILo2PQIrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::MOVQI2PQIrm:
+ case X86::MOVZQI2PQIrm:
+ case X86::MOVZPQILo2PQIrm:
+ case X86::VMOVQI2PQIrm:
+ case X86::VMOVZQI2PQIrm:
+ case X86::VMOVZPQILo2PQIrm:
+ DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::MOVDI2PDIrm:
+ case X86::VMOVDI2PDIrm:
+ DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
+ case X86::PMOVZXBWrr:
+ case X86::PMOVZXBDrr:
+ case X86::PMOVZXBQrr:
+ case X86::PMOVZXWDrr:
+ case X86::PMOVZXWQrr:
+ case X86::PMOVZXDQrr:
+ case X86::VPMOVZXBWrr:
+ case X86::VPMOVZXBDrr:
+ case X86::VPMOVZXBQrr:
+ case X86::VPMOVZXWDrr:
+ case X86::VPMOVZXWQrr:
+ case X86::VPMOVZXDQrr:
+ case X86::VPMOVZXBWYrr:
+ case X86::VPMOVZXBDYrr:
+ case X86::VPMOVZXBQYrr:
+ case X86::VPMOVZXWDYrr:
+ case X86::VPMOVZXWQYrr:
+ case X86::VPMOVZXDQYrr:
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ // FALL THROUGH.
+ case X86::PMOVZXBWrm:
+ case X86::PMOVZXBDrm:
+ case X86::PMOVZXBQrm:
+ case X86::PMOVZXWDrm:
+ case X86::PMOVZXWQrm:
+ case X86::PMOVZXDQrm:
+ case X86::VPMOVZXBWrm:
+ case X86::VPMOVZXBDrm:
+ case X86::VPMOVZXBQrm:
+ case X86::VPMOVZXWDrm:
+ case X86::VPMOVZXWQrm:
+ case X86::VPMOVZXDQrm:
+ case X86::VPMOVZXBWYrm:
+ case X86::VPMOVZXBDYrm:
+ case X86::VPMOVZXBQYrm:
+ case X86::VPMOVZXWDYrm:
+ case X86::VPMOVZXWQYrm:
+ case X86::VPMOVZXDQYrm: {
+ MVT SrcVT, DstVT;
+ getZeroExtensionTypes(MI, SrcVT, DstVT);
+ DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ } break;
+ }
+
+ // The only comments we decode are shuffles, so give up if we were unable to
+ // decode a shuffle mask.
+ if (ShuffleMask.empty())
+ return false;
+
+ if (!DestName) DestName = Src1Name;
+ OS << (DestName ? DestName : "mem") << " = ";
+
+ // If the two sources are the same, canonicalize the input elements to be
+ // from the first src so that we get larger element spans.
+ if (Src1Name == Src2Name) {
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+ ShuffleMask[i] >= (int)e) // From second mask.
+ ShuffleMask[i] -= e;
+ }
+ }
+
+ // The shuffle mask specifies which elements of the src1/src2 fill in the
+ // destination, with a few sentinel values. Loop through and print them
+ // out.
+ for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+ if (i != 0)
+ OS << ',';
+ if (ShuffleMask[i] == SM_SentinelZero) {
+ OS << "zero";
+ continue;
+ }
+
+ // Otherwise, it must come from src1 or src2. Print the span of elements
+ // that comes from this src.
+ bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+ const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+ OS << (SrcName ? SrcName : "mem") << '[';
+ bool IsFirst = true;
+ while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+ (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+ if (!IsFirst)
+ OS << ',';
+ else
+ IsFirst = false;
+ if (ShuffleMask[i] == SM_SentinelUndef)
+ OS << "u";
+ else
+ OS << ShuffleMask[i] % ShuffleMask.size();
+ ++i;
+ }
+ OS << ']';
+ --i; // The enclosing for loop increments the element index.
+ }
+ //MI->print(OS, 0);
+ OS << "\n";
+
+ // We successfully added a comment to this instruction.
+ return true;
+}
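The comment-printing loop at the end of the hunk above is easiest to follow in isolation. The following is a minimal standalone sketch of that loop, not part of the patch: the function name printShuffleComment, the plain std::vector/std::ostream interface, and the example register names are illustrative assumptions, and the src1 == src2 canonicalization step is omitted for brevity.

#include <iostream>
#include <vector>

// Sentinel values matching X86ShuffleDecode.h.
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

void printShuffleComment(std::ostream &OS, const std::vector<int> &Mask,
                         const char *Src1Name, const char *Src2Name,
                         const char *DestName) {
  if (Mask.empty())
    return; // Only shuffles are decoded; nothing to print otherwise.

  unsigned e = Mask.size();
  OS << (DestName ? DestName : "mem") << " = ";
  for (unsigned i = 0; i != e; ++i) {
    if (i != 0)
      OS << ',';
    if (Mask[i] == SM_SentinelZero) {
      OS << "zero";
      continue;
    }

    // Print the span of consecutive elements that come from the same source.
    bool isSrc1 = Mask[i] < (int)e;
    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
    OS << (SrcName ? SrcName : "mem") << '[';
    bool IsFirst = true;
    while (i != e && Mask[i] != SM_SentinelZero &&
           (Mask[i] < (int)e) == isSrc1) {
      if (!IsFirst)
        OS << ',';
      else
        IsFirst = false;
      if (Mask[i] == SM_SentinelUndef)
        OS << 'u';
      else
        OS << Mask[i] % (int)e;
      ++i;
    }
    OS << ']';
    --i; // The enclosing for loop increments the element index.
  }
  OS << '\n';
}

int main() {
  // UNPCKLPS of xmm1 and xmm2 into xmm0 decodes to the mask {0, 4, 1, 5}.
  printShuffleComment(std::cout, {0, 4, 1, 5}, "xmm1", "xmm2", "xmm0");
}

Compiled and run, the example prints "xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]", which is the comment the patched printer emits for an UNPCKLPS of xmm1 and xmm2 into xmm0, since DecodeUNPCKLMask(MVT::v4f32, ...) produces the mask {0, 4, 1, 5}.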
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 1c8466b..91d1828 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -50,33 +50,9 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } -void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0xf; - switch (Imm) { - default: llvm_unreachable("Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - } -} - -void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; +void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); switch (Imm) { default: llvm_unreachable("Invalid avxcc argument!"); case 0: O << "eq"; break; @@ -114,8 +90,24 @@ void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, } } +void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, - raw_ostream &O) { + raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; switch (Imm) { case 0: O << "{rn-sae}"; break; @@ -168,21 +160,21 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); - + // If this has a segment register, print it. 
if (SegReg.getReg()) { printOperand(MI, Op+X86::AddrSegmentReg, O); O << ':'; } - + O << '['; - + bool NeedPlus = false; if (BaseReg.getReg()) { printOperand(MI, Op+X86::AddrBaseReg, O); NeedPlus = true; } - + if (IndexReg.getReg()) { if (NeedPlus) O << " + "; if (ScaleVal != 1) @@ -209,7 +201,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << formatImm(DispVal); } } - + O << ']'; } @@ -257,3 +249,8 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, O << ']'; } + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index d082f0b..2150144 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -36,19 +36,24 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); - void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); - void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "opaque ptr "; printMemReference(MI, OpNo, O); } - + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "byte ptr "; printMemReference(MI, OpNo, O); @@ -152,7 +157,7 @@ public: printMemOffset(MI, OpNo, O); } }; - + } #endif diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index befa6c2..719b761 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -512,7 +512,7 @@ protected: // Defines a new offset for the CFA. E.g. // // With frame: - // + // // pushq %rbp // L0: // .cfi_def_cfa_offset 16 @@ -682,7 +682,7 @@ private: // 4 3 // 5 3 // - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { + for (unsigned i = 0; i < RegCount; ++i) { int CUReg = getCompactUnwindRegNum(SavedRegs[i]); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; @@ -777,39 +777,6 @@ public: MachO::CPU_TYPE_X86_64, Subtype); } - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Temporary labels in the string literals sections require symbols. The - // issue is that the x86_64 relocation format does not allow symbol + - // offset, and so the linker does not have enough information to resolve the - // access to the appropriate atom unless an external relocation is used. For - // non-cstring sections, we expect the compiler to use a non-temporary label - // for anything that could have an addend pointing outside the symbol. - // - // See <rdar://problem/4765733>. 
- const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - return SMO.getType() == MachO::S_CSTRING_LITERALS; - } - - bool isSectionAtomizable(const MCSection &Section) const override { - const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - // Fixed sized data sections are uniqued, they cannot be diced into atoms. - switch (SMO.getType()) { - default: - return true; - - case MachO::S_4BYTE_LITERALS: - case MachO::S_8BYTE_LITERALS: - case MachO::S_16BYTE_LITERALS: - case MachO::S_LITERAL_POINTERS: - case MachO::S_NON_LAZY_SYMBOL_POINTERS: - case MachO::S_LAZY_SYMBOL_POINTERS: - case MachO::S_MOD_INIT_FUNC_POINTERS: - case MachO::S_MOD_TERM_FUNC_POINTERS: - case MachO::S_INTERPOSING: - return false; - } - } - /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 365cf0c..d4698bf 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -302,19 +302,21 @@ namespace X86II { //// MRM_XX - A mod/rm byte of exactly 0xXX. MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, - MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, - MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43, - MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47, - MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51, - MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55, - MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59, - MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63, - MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67, - MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71, - MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75, - MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79, - MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83, - MRM_FF = 84, + MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39, + MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43, + MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47, + MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51, + MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55, + MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59, + MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63, + MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67, + MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71, + MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75, + MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79, + MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83, + MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87, + MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91, + MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95, FormMask = 127, @@ -328,21 +330,28 @@ namespace X86II { OpSizeShift = 7, OpSizeMask = 0x3 << OpSizeShift, - OpSize16 = 1 << OpSizeShift, - OpSize32 = 2 << OpSizeShift, + OpSizeFixed = 0 << OpSizeShift, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, - // AsSize - Set if this instruction requires an operand size prefix (0x67), - // which most often indicates that the instruction address 16 bit address - // instead of 32 bit address (or 32 bit address in 64 bit mode). + // AsSize - AdSizeX implies this instruction determines its need of 0x67 + // prefix from a normal ModRM memory operand. The other types indicate that + // an operand is encoded with a specific width and a prefix is needed if + // it differs from the current mode. 
AdSizeShift = OpSizeShift + 2, - AdSize = 1 << AdSizeShift, + AdSizeMask = 0x3 << AdSizeShift, + + AdSizeX = 1 << AdSizeShift, + AdSize16 = 1 << AdSizeShift, + AdSize32 = 2 << AdSizeShift, + AdSize64 = 3 << AdSizeShift, //===------------------------------------------------------------------===// // OpPrefix - There are several prefix bytes that are used as opcode // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is // no prefix. // - OpPrefixShift = AdSizeShift + 1, + OpPrefixShift = AdSizeShift + 2, OpPrefixMask = 0x7 << OpPrefixShift, // PS, PD - Prefix code for packed single and double precision vector @@ -669,19 +678,10 @@ namespace X86II { return -1; case X86II::MRMDestMem: return 0; - case X86II::MRMSrcMem: { - unsigned FirstMemOp = 1; - if (HasVEX_4V) - ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). - if (HasMemOp4) - ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). - if (HasEVEX_K) - ++FirstMemOp;// Skip the mask register - // FIXME: Maybe lea should have its own form? This is a horrible hack. - //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || - // Opcode == X86::LEA16r || Opcode == X86::LEA32r) - return FirstMemOp; - } + case X86II::MRMSrcMem: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. + return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K; case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -692,15 +692,9 @@ namespace X86II { case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - bool HasVEX_4V = TSFlags & X86II::VEX_4V; - unsigned FirstMemOp = 0; - if (HasVEX_4V) - ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). - if (HasEVEX_K) - ++FirstMemOp;// Skip the mask register - return FirstMemOp; - } + case X86II::MRM6m: case X86II::MRM7m: + // Start from 0, skip registers encoded in VEX_VVVV or a mask register. + return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: @@ -759,7 +753,7 @@ namespace X86II { (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); } - + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index be6a8e4..e8b0b4c 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -222,6 +222,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, case MCSymbolRefExpr::VK_GOT: Type = ELF::R_386_GOT32; break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_386_PLT32; + break; case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_386_GOTOFF; break; diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 5679d63..e64b963 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -108,12 +108,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split - // into two .words. 
- if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) && - T.getArch() == Triple::x86) - Data64bitsDirective = nullptr; - // Always enable the integrated assembler by default. // Clang also enabled it when the OS is Solaris but that is redundant here. UseIntegratedAssembler = true; @@ -135,9 +129,10 @@ void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; PointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; - ExceptionsType = ExceptionHandling::ItaniumWinEH; + ExceptionsType = ExceptionHandling::WinEH; } AssemblerDialect = AsmWriterFlavor; @@ -155,9 +150,10 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; PointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; - ExceptionsType = ExceptionHandling::ItaniumWinEH; + ExceptionsType = ExceptionHandling::WinEH; } else { ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index f2f06c3..deaad2a 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -23,7 +23,8 @@ namespace llvm { class Triple; class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - void anchor() override; + virtual void anchor(); + public: explicit X86MCAsmInfoDarwin(const Triple &Triple); }; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 31b8e2d..3ad8ab1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -30,8 +30,8 @@ using namespace llvm; namespace { class X86MCCodeEmitter : public MCCodeEmitter { - X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; + X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; + void operator=(const X86MCCodeEmitter &) = delete; const MCInstrInfo &MCII; MCContext &Ctx; public: @@ -590,6 +590,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); + uint64_t Encoding = TSFlags & X86II::EncodingMask; bool HasEVEX_K = TSFlags & X86II::EVEX_K; bool HasVEX_4V = TSFlags & X86II::VEX_4V; @@ -721,7 +723,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MemAddr, src1(VEX_4V), src2(ModR/M) // MemAddr, src1(ModR/M), imm8 // - if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + X86::AddrBaseReg).getReg())) VEX_B = 0x0; if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + @@ -863,7 +865,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; } EncodeRC = true; - } + } break; case X86II::MRMDestReg: // MRMDestReg instructions forms: @@ -1109,6 +1111,10 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, : X86II::OpSize16)) EmitByte(0x66, CurByte, OS); + // Emit the LOCK opcode prefix. 
+ if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: // 66 EmitByte(0x66, CurByte, OS); @@ -1182,10 +1188,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; - // Emit the lock opcode prefix as needed. - if (TSFlags & X86II::LOCK) - EmitByte(0xF0, CurByte, OS); - // Emit segment override opcode prefix as needed. if (MemoryOperand >= 0) EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg, @@ -1197,16 +1199,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // Emit the address size opcode prefix as needed. bool need_address_override; - // The AdSize prefix is only for 32-bit and 64-bit modes. Hm, perhaps we - // should introduce an AdSize16 bit instead of having seven special cases? - if ((!is16BitMode(STI) && TSFlags & X86II::AdSize) || - (is16BitMode(STI) && (MI.getOpcode() == X86::JECXZ_32 || - MI.getOpcode() == X86::MOV8o8a || - MI.getOpcode() == X86::MOV16o16a || - MI.getOpcode() == X86::MOV32o32a || - MI.getOpcode() == X86::MOV8ao8 || - MI.getOpcode() == X86::MOV16ao16 || - MI.getOpcode() == X86::MOV32ao32))) { + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + if ((is16BitMode(STI) && AdSize == X86II::AdSize32) || + (is32BitMode(STI) && AdSize == X86II::AdSize16) || + (is64BitMode(STI) && AdSize == X86II::AdSize32)) { need_address_override = true; } else if (MemoryOperand < 0) { need_address_override = false; @@ -1430,83 +1426,31 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case 
X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: EmitByte(BaseOpcode, CurByte, OS); - unsigned char MRM; - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Invalid Form"); - case X86II::MRM_C0: MRM = 0xC0; break; - case X86II::MRM_C1: MRM = 0xC1; break; - case X86II::MRM_C2: MRM = 0xC2; break; - case X86II::MRM_C3: MRM = 0xC3; break; - case X86II::MRM_C4: MRM = 0xC4; break; - case X86II::MRM_C8: MRM = 0xC8; break; - case X86II::MRM_C9: MRM = 0xC9; break; - case X86II::MRM_CA: MRM = 0xCA; break; - case X86II::MRM_CB: MRM = 0xCB; break; - case X86II::MRM_CF: MRM = 0xCF; break; - case X86II::MRM_D0: MRM = 0xD0; break; - case X86II::MRM_D1: MRM = 0xD1; break; - case X86II::MRM_D4: MRM = 0xD4; break; - case X86II::MRM_D5: MRM = 0xD5; break; - case X86II::MRM_D6: MRM = 0xD6; break; - case X86II::MRM_D7: MRM = 0xD7; break; - case X86II::MRM_D8: MRM = 0xD8; break; - case X86II::MRM_D9: MRM = 0xD9; break; - case X86II::MRM_DA: MRM = 0xDA; break; - case X86II::MRM_DB: MRM = 0xDB; break; - case X86II::MRM_DC: MRM = 0xDC; break; - case X86II::MRM_DD: MRM = 0xDD; break; - case X86II::MRM_DE: MRM = 0xDE; break; - case X86II::MRM_DF: MRM = 0xDF; break; - case X86II::MRM_E0: MRM = 0xE0; break; - case X86II::MRM_E1: MRM = 0xE1; break; - case X86II::MRM_E2: MRM = 0xE2; break; - case X86II::MRM_E3: MRM = 0xE3; break; - case X86II::MRM_E4: MRM = 0xE4; break; - case X86II::MRM_E5: MRM = 0xE5; break; - case X86II::MRM_E8: MRM = 0xE8; break; - case X86II::MRM_E9: MRM = 0xE9; break; - case X86II::MRM_EA: MRM = 0xEA; break; - case X86II::MRM_EB: MRM = 0xEB; break; - case X86II::MRM_EC: MRM = 0xEC; break; - case X86II::MRM_ED: MRM = 0xED; break; - case X86II::MRM_EE: MRM = 0xEE; break; - case X86II::MRM_F0: MRM = 0xF0; break; - case X86II::MRM_F1: MRM = 0xF1; break; - case X86II::MRM_F2: MRM = 0xF2; break; - case X86II::MRM_F3: MRM = 0xF3; break; - case X86II::MRM_F4: MRM = 0xF4; break; - case X86II::MRM_F5: MRM = 0xF5; break; - case X86II::MRM_F6: MRM = 0xF6; break; - case X86II::MRM_F7: MRM = 0xF7; break; - case X86II::MRM_F8: MRM = 0xF8; break; - case X86II::MRM_F9: MRM = 0xF9; break; - case X86II::MRM_FA: MRM = 0xFA; break; - case X86II::MRM_FB: MRM = 0xFB; break; - case X86II::MRM_FC: MRM = 0xFC; break; - case X86II::MRM_FD: MRM = 0xFD; break; - case X86II::MRM_FE: MRM = 0xFE; break; - case X86II::MRM_FF: MRM = 0xFF; break; - } - EmitByte(MRM, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); break; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5a9181d..0e7b4e5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -134,18 +134,13 @@ bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, "c" (subleaf)); return false; #elif defined(_MSC_VER) - // __cpuidex was added in MSVC++ 9.0 SP1 - #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif + int registers[4]; + __cpuidex(registers, value, subleaf); + 
*rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; #else return true; #endif diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index aef9571..d8320b9 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -40,8 +40,8 @@ namespace DWARFFlavour { enum { X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 }; -} - +} + /// N86 namespace - Native X86 register numbers /// namespace N86 { diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 5685a7f..7a83f4c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -47,23 +48,21 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); - void RecordX86_64Relocation(MachObjectWriter *Writer, - const MCAssembler &Asm, + void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue); + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + public: X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, /*UseAggressiveSymbolFolding=*/Is64Bit) {} - void RecordRelocation(MachObjectWriter *Writer, - const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, uint64_t &FixedValue) override { + void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { if (Writer->is64Bit()) RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); @@ -97,13 +96,10 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { } } -void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, - const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue) { +void X86MachObjectWriter::RecordX86_64Relocation( + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -117,6 +113,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, unsigned Index = 0; unsigned IsExtern = 0; unsigned Type = 0; + const MCSymbolData *RelSymbol = nullptr; Value = Target.getConstant(); @@ -132,7 +129,6 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (Target.isAbsolute()) { // constant // SymbolNum of 0 indicates the absolute section. Type = MachO::X86_64_RELOC_UNSIGNED; - Index = 0; // FIXME: I believe this is broken, I don't think the linker can understand // it. 
I think it would require a local relocation, but I'm not sure if that @@ -184,7 +180,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation with subtraction expression, symbol '" + + "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); } @@ -193,38 +189,30 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Value -= Writer->getSymbolAddress(&B_SD, Layout) - (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout)); - if (A_Base) { - Index = A_Base->getIndex(); - IsExtern = 1; - } - else { + if (!A_Base) Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; - } Type = MachO::X86_64_RELOC_UNSIGNED; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - if (B_Base) { - Index = B_Base->getIndex(); - IsExtern = 1; - } - else { + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + + if (B_Base) + RelSymbol = B_Base; + else Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; - } Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + if (Symbol->isTemporary() && Value) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Asm.addLocalUsedInReloc(*Symbol); + } const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); + RelSymbol = Asm.getAtom(&SD); // Relocations inside debug sections always use local relocations when // possible. This seems to be done because the debugger doesn't fully @@ -234,23 +222,20 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, const MCSectionMachO &Section = static_cast<const MCSectionMachO&>( Fragment->getParent()->getSection()); if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = nullptr; + RelSymbol = nullptr; } // x86_64 almost always uses external relocations, except when there is no // symbol to use as a base address (a local symbol with no preceding // non-local symbol). - if (Base) { - Index = Base->getIndex(); - IsExtern = 1; - + if (RelSymbol) { // Add the local offset, if needed. - if (Base != &SD) - Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + if (RelSymbol != &SD) + Value += + Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(RelSymbol); } else if (Symbol->isInSection() && !Symbol->isVariable()) { // The index is the section ordinal (1-based). 
Index = SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; Value += Writer->getSymbolAddress(&SD, Layout); if (IsPCRel) @@ -349,12 +334,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, @@ -426,7 +408,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, (IsPCRel << 30) | MachO::R_SCATTERED); MRE.r_word1 = Value2; - Writer->addRelocation(Fragment->getParent(), MRE); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); } else { // If the offset is more than 24-bits, it won't fit in a scattered // relocation offset field, so we fall back to using a non-scattered @@ -448,7 +430,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, (IsPCRel << 30) | MachO::R_SCATTERED); MRE.r_word1 = Value; - Writer->addRelocation(Fragment->getParent(), MRE); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); return true; } @@ -469,7 +451,6 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // Get the symbol data. const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); - unsigned Index = SD_A->getIndex(); // We're only going to have a second symbol in pic mode and it'll be a // subtraction from the picbase. For 32-bit pic the addend is the difference @@ -492,12 +473,9 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = Value; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (1 << 27) | // r_extern - (MachO::GENERIC_RELOC_TLV << 28)); // r_type - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); + Writer->addRelocation(SD_A, Fragment->getParent(), MRE); } void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, @@ -548,8 +526,8 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // See <reloc.h>. uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned Index = 0; - unsigned IsExtern = 0; unsigned Type = 0; + const MCSymbolData *RelSymbol = nullptr; if (Target.isAbsolute()) { // constant // SymbolNum of 0 indicates the absolute section. @@ -570,8 +548,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // Check whether we need an external or internal relocation. if (Writer->doesSymbolRequireExternRelocation(SD)) { - IsExtern = 1; - Index = SD->getIndex(); + RelSymbol = SD; // For external relocations, make sure to offset the fixup value to // compensate for the addend of the symbol address, if it was // undefined. This occurs with weak definitions, for example. 
@@ -593,12 +570,9 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS, diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 40af822..e1df5c2 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -28,7 +28,8 @@ namespace { virtual ~X86WinCOFFObjectWriter(); unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection) const override; + bool IsCrossSection, + const MCAsmBackend &MAB) const override; }; } @@ -40,7 +41,8 @@ X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection) const { + bool IsCrossSection, + const MCAsmBackend &MAB) const { unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 1ea8798..fceb083 100644 --- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -13,7 +13,7 @@ using namespace llvm; Target llvm::TheX86_32Target, llvm::TheX86_64Target; -extern "C" void LLVMInitializeX86TargetInfo() { +extern "C" void LLVMInitializeX86TargetInfo() { RegisterTarget<Triple::x86, /*HasJIT=*/true> X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index ba6cbc8..a7101e4 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -1,395 +1,434 @@ -//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Define several functions to decode x86 specific shuffle semantics into a -// generic vector mask. -// -//===----------------------------------------------------------------------===// - -#include "X86ShuffleDecode.h" -#include "llvm/IR/Constants.h" -#include "llvm/CodeGen/MachineValueType.h" - -//===----------------------------------------------------------------------===// -// Vector Mask Decoding -//===----------------------------------------------------------------------===// - -namespace llvm { - -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - // Defaults the copying the dest value. - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - - // Decode the immediate. - unsigned ZMask = Imm & 15; - unsigned CountD = (Imm >> 4) & 3; - unsigned CountS = (Imm >> 6) & 3; - - // CountS selects which input element to use. 
- unsigned InVal = 4+CountS; - // CountD specifies which element of destination to update. - ShuffleMask[CountD] = InVal; - // ZMask zaps values, potentially overriding the CountD elt. - if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; - if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; - if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; - if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; -} - -// <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(NElts+i); - - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(i); -} - -// <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(i); - - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(NElts+i); -} - -void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - for (int i = 0, e = NumElts / 2; i < e; ++i) { - ShuffleMask.push_back(2 * i); - ShuffleMask.push_back(2 * i); - } -} - -void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - for (int i = 0, e = NumElts / 2; i < e; ++i) { - ShuffleMask.push_back(2 * i + 1); - ShuffleMask.push_back(2 * i + 1); - } -} - -void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - unsigned VectorSizeInBits = VT.getSizeInBits(); - unsigned NumElts = VectorSizeInBits / 8; - unsigned NumLanes = VectorSizeInBits / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l < NumElts; l += NumLaneElts) - for (unsigned i = 0; i < NumLaneElts; ++i) { - int M = SM_SentinelZero; - if (i >= Imm) M = i - Imm + l; - ShuffleMask.push_back(M); - } -} - -void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - unsigned VectorSizeInBits = VT.getSizeInBits(); - unsigned NumElts = VectorSizeInBits / 8; - unsigned NumLanes = VectorSizeInBits / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l < NumElts; l += NumLaneElts) - for (unsigned i = 0; i < NumLaneElts; ++i) { - unsigned Base = i + Imm; - int M = Base + l; - if (Base >= NumLaneElts) M = SM_SentinelZero; - ShuffleMask.push_back(M); - } -} - -void DecodePALIGNRMask(MVT VT, unsigned Imm, - SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0; i != NumLaneElts; ++i) { - unsigned Base = i + Offset; - // if i+offset is out of this lane then we actually need the other source - if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; - ShuffleMask.push_back(Base + l); - } - } -} - -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. 
-void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - unsigned NewImm = Imm; - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0; i != NumLaneElts; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + l); - NewImm /= NumLaneElts; - } - if (NumLaneElts == 4) NewImm = Imm; // reload imm - } -} - -void DecodePSHUFHWMask(MVT VT, unsigned Imm, - SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - for (unsigned l = 0; l != NumElts; l += 8) { - unsigned NewImm = Imm; - for (unsigned i = 0, e = 4; i != e; ++i) { - ShuffleMask.push_back(l + i); - } - for (unsigned i = 4, e = 8; i != e; ++i) { - ShuffleMask.push_back(l + 4 + (NewImm & 3)); - NewImm >>= 2; - } - } -} - -void DecodePSHUFLWMask(MVT VT, unsigned Imm, - SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - for (unsigned l = 0; l != NumElts; l += 8) { - unsigned NewImm = Imm; - for (unsigned i = 0, e = 4; i != e; ++i) { - ShuffleMask.push_back(l + (NewImm & 3)); - NewImm >>= 2; - } - for (unsigned i = 4, e = 8; i != e; ++i) { - ShuffleMask.push_back(l + i); - } - } -} - -/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates -/// the type of the vector allowing it to handle different datatypes and vector -/// widths. -void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElts / NumLanes; - - unsigned NewImm = Imm; - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - // each half of a lane comes from different source - for (unsigned s = 0; s != NumElts*2; s += NumElts) { - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + s + l); - NewImm /= NumLaneElts; - } - } - if (NumLaneElts == 4) NewImm = Imm; // reload imm - } -} - -/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd -/// and punpckh*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 - } - } -} - -/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// and punpckl*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { - unsigned NumElts = VT.getVectorNumElements(); - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. 
- unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX - unsigned NumLaneElts = NumElts / NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 - } - } -} - -void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, - SmallVectorImpl<int> &ShuffleMask) { - if (Imm & 0x88) - return; // Not a shuffle - - unsigned HalfSize = VT.getVectorNumElements()/2; - - for (unsigned l = 0; l != 2; ++l) { - unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; - for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); - } -} - -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy(8) && - "Expected i8 constant mask elements!"); - int NumElements = MaskTy->getVectorNumElements(); - // FIXME: Add support for AVX-512. - assert((NumElements == 16 || NumElements == 32) && - "Only 128-bit and 256-bit vectors supported!"); - ShuffleMask.reserve(NumElements); - - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; - uint64_t Element = CDS->getElementAsInteger(i); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == CV->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } -} - -void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, - SmallVectorImpl<int> &ShuffleMask) { - for (int i = 0, e = RawMask.size(); i < e; ++i) { - uint64_t M = RawMask[i]; - if (M == (uint64_t)SM_SentinelUndef) { - ShuffleMask.push_back(M); - continue; - } - // For AVX vectors with 32 bytes the base of the shuffle is the half of - // the vector we're inside. - int Base = i < 16 ? 0 : 16; - // If the high bit (7) of the byte is set, the element is zeroed. - if (M & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. 
- int Index = Base + (M & 0xf); - ShuffleMask.push_back(Index); - } - } -} - -void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - int ElementBits = VT.getScalarSizeInBits(); - int NumElements = VT.getVectorNumElements(); - for (int i = 0; i < NumElements; ++i) { - // If there are more than 8 elements in the vector, then any immediate blend - // mask applies to each 128-bit lane. There can never be more than - // 8 elements in a 128-bit lane with an immediate blend. - int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; - assert(Bit < 8 && - "Immediate blends only operate over 8 elements at a time!"); - ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); - } -} - -/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. -/// No VT provided since it only works on 256-bit, 4 element vectors. -void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm >> (2*i)) & 3); - } -} - -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && - "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } -} - -} // llvm namespace +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ // Default to copying the dest value.
+ ShuffleMask.push_back(0);
+ ShuffleMask.push_back(1);
+ ShuffleMask.push_back(2);
+ ShuffleMask.push_back(3);
+
+ // Decode the immediate.
+ unsigned ZMask = Imm & 15;
+ unsigned CountD = (Imm >> 4) & 3;
+ unsigned CountS = (Imm >> 6) & 3;
+
+ // CountS selects which input element to use.
+ unsigned InVal = 4+CountS;
+ // CountD specifies which element of destination to update.
+ ShuffleMask[CountD] = InVal;
+ // ZMask zaps values, potentially overriding the CountD elt.
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+ if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+ if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+ if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
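The decoder above is easiest to follow with a hand-traced immediate. The sketch below is illustration only, not part of the patch; the helper name and the chosen immediate are made up, and it assumes an in-tree build so the header and SmallVector are available:

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"

  // Imm = 0x91 decodes as CountS = 2, CountD = 1, ZMask = 0001.
  void exampleInsertPS() {
    llvm::SmallVector<int, 4> Mask;
    llvm::DecodeINSERTPSMask(0x91, Mask);
    // Destination element 1 is replaced by source element 2 (index 6 in
    // two-input numbering) and element 0 is zeroed, so Mask ends up as
    // <-2, 6, 2, 3>, where -2 is SM_SentinelZero.
  }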
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = NElts/2; i != NElts; ++i)
+ ShuffleMask.push_back(NElts+i);
+
+ for (unsigned i = NElts/2; i != NElts; ++i)
+ ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != NElts/2; ++i)
+ ShuffleMask.push_back(i);
+
+ for (unsigned i = 0; i != NElts/2; ++i)
+ ShuffleMask.push_back(NElts+i);
+}
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i);
+ ShuffleMask.push_back(2 * i);
+ }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ for (int i = 0, e = NumElts / 2; i < e; ++i) {
+ ShuffleMask.push_back(2 * i + 1);
+ ShuffleMask.push_back(2 * i + 1);
+ }
+}
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+ unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+ for (unsigned s = 0; s != NumLaneSubElts; s++)
+ ShuffleMask.push_back(l + s);
+}
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ int M = SM_SentinelZero;
+ if (i >= Imm) M = i - Imm + l;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned VectorSizeInBits = VT.getSizeInBits();
+ unsigned NumElts = VectorSizeInBits / 8;
+ unsigned NumLanes = VectorSizeInBits / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+ for (unsigned i = 0; i < NumLaneElts; ++i) {
+ unsigned Base = i + Imm;
+ int M = Base + l;
+ if (Base >= NumLaneElts) M = SM_SentinelZero;
+ ShuffleMask.push_back(M);
+ }
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // If i+offset is out of this lane then we actually need the other source.
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
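For a concrete PALIGNR case (illustration only, assuming an in-tree build; the helper name is made up):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void examplePALIGNR() {
    llvm::SmallVector<int, 16> Mask;
    // An immediate of 4 on v16i8 shifts the concatenated sources right by
    // 4 bytes, so the mask wraps into the second operand near the top.
    llvm::DecodePALIGNRMask(llvm::MVT::v16i8, 4, Mask);
    // Mask is <4, 5, ..., 15, 16, 17, 18, 19>; indices >= 16 read from the
    // second source in the two-input mask numbering.
  }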
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + l);
+ NewImm /= NumLaneElts;
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
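A quick hand-traced check of the decoder above (illustration only, assuming an in-tree build; the helper name is made up):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void examplePSHUFD() {
    llvm::SmallVector<int, 4> Mask;
    // pshufd $0x1B on v4i32: the selectors are 3, 2, 1, 0 reading from the
    // low bit pair upwards, i.e. a full element reversal.
    llvm::DecodePSHUFMask(llvm::MVT::v4i32, 0x1B, Mask);
    // Mask is <3, 2, 1, 0>.
  }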
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + 4 + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ unsigned NewImm = Imm;
+ for (unsigned i = 0, e = 4; i != e; ++i) {
+ ShuffleMask.push_back(l + (NewImm & 3));
+ NewImm >>= 2;
+ }
+ for (unsigned i = 4, e = 8; i != e; ++i) {
+ ShuffleMask.push_back(l + i);
+ }
+ }
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ unsigned NewImm = Imm;
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ // Each half of a lane comes from a different source.
+ for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+ ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+ NewImm /= NumLaneElts;
+ }
+ }
+ if (NumLaneElts == 4) NewImm = Imm; // reload imm
+ }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i+NumElts); // Reads from src/src2
+ }
+ }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
+ ShuffleMask.push_back(i); // Reads from dest/src1
+ ShuffleMask.push_back(i+NumElts); // Reads from src/src2
+ }
+ }
+}
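The two unpack decoders interleave as expected; a small sketch (illustration only, assuming an in-tree build):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void exampleUNPCK() {
    llvm::SmallVector<int, 8> Lo, Hi;
    llvm::DecodeUNPCKLMask(llvm::MVT::v4i32, Lo); // Lo is <0, 4, 1, 5>
    llvm::DecodeUNPCKHMask(llvm::MVT::v4i32, Hi); // Hi is <2, 6, 3, 7>
    // Even mask positions read src1; odd positions read src2, which starts
    // at index NumElts in the two-input numbering.
  }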
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Imm & 0x88)
+ return; // Not a shuffle
+
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+ for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+ }
+}
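Each nibble of the VPERM2X128 immediate selects a 128-bit half of one of the two sources; for example (illustration only, assuming an in-tree build):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void exampleVPERM2F128() {
    llvm::SmallVector<int, 8> Mask;
    // Imm = 0x31: selector 1 (high half of src1) for the low result half,
    // selector 3 (high half of src2) for the high result half.
    llvm::DecodeVPERM2X128Mask(llvm::MVT::v8f32, 0x31, Mask);
    // Mask is <4, 5, 6, 7, 12, 13, 14, 15>.
  }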
+
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ // It is not an error for the PSHUFB mask to not be a vector of i8 because the
+ // constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+
+ if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ return;
+
+ // This is a straightforward byte vector.
+ if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
+ int NumElements = MaskTy->getVectorNumElements();
+ ShuffleMask.reserve(NumElements);
+
+ for (int i = 0; i < NumElements; ++i) {
+ // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+ // lane of the vector we're inside.
+ int Base = i < 16 ? 0 : 16;
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (Element & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (Element & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+ }
+ // TODO: Handle funny-looking vectors too.
+}
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ if (M == (uint64_t)SM_SentinelUndef) {
+ ShuffleMask.push_back(M);
+ continue;
+ }
+ // For AVX vectors with 32 bytes the base of the shuffle is the half of
+ // the vector we're inside.
+ int Base = i < 16 ? 0 : 16;
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (M & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (M & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
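The raw-mask overload follows the same byte rules as the constant-based one; a sketch with a made-up 16-byte mask (illustration only, assuming an in-tree build):

  #include <cstdint>
  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  void examplePSHUFB() {
    // Bytes with bit 7 set zero the lane; otherwise only the low 4 bits
    // select a byte within the 16-byte lane.
    static const uint64_t RawMask[16] = { 3, 2, 1, 0, 0x80, 0x80, 0x80, 0x80,
                                          7, 6, 5, 4, 0x80, 0x80, 0x80, 0x80 };
    llvm::SmallVector<int, 16> Mask;
    llvm::DecodePSHUFBMask(llvm::makeArrayRef(RawMask), Mask);
    // Mask is <3,2,1,0, -2,-2,-2,-2, 7,6,5,4, -2,-2,-2,-2>, where -2 is
    // SM_SentinelZero.
  }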
+
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ int ElementBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ for (int i = 0; i < NumElements; ++i) {
+ // If there are more than 8 elements in the vector, then any immediate blend
+ // mask applies to each 128-bit lane. There can never be more than
+ // 8 elements in a 128-bit lane with an immediate blend.
+ int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+ assert(Bit < 8 &&
+ "Immediate blends only operate over 8 elements at a time!");
+ ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+ }
+}
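A worked example for the immediate blend decoder (illustration only, assuming an in-tree build):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void exampleBLEND() {
    llvm::SmallVector<int, 8> Mask;
    // pblendw $0xF0 on v8i16: clear bits 0-3 keep src1, set bits 4-7 take
    // the corresponding src2 elements (indices 8-15).
    llvm::DecodeBLENDMask(llvm::MVT::v8i16, 0xF0, Mask);
    // Mask is <0, 1, 2, 3, 12, 13, 14, 15>.
  }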
+
+/// DecodeVPERMMask - This decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4-element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+ for (unsigned i = 0; i != 4; ++i) {
+ ShuffleMask.push_back((Imm >> (2*i)) & 3);
+ }
+}
+
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+ assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+ "Expected integer constant mask elements!");
+ int ElementBits = MaskTy->getScalarSizeInBits();
+ int NumElements = MaskTy->getVectorNumElements();
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ "Unexpected number of vector elements.");
+ ShuffleMask.reserve(NumElements);
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+ assert((unsigned)NumElements == CDS->getNumElements() &&
+ "Constant mask has a different number of elements!");
+
+ for (int i = 0; i < NumElements; ++i) {
+ int Base = (i * ElementBits / 128) * (128 / ElementBits);
+ uint64_t Element = CDS->getElementAsInteger(i);
+ // Only the least significant 2 bits of the integer are used.
+ int Index = Base + (Element & 0x3);
+ ShuffleMask.push_back(Index);
+ }
+ } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+ assert((unsigned)NumElements == C->getNumOperands() &&
+ "Constant mask has a different number of elements!");
+
+ for (int i = 0; i < NumElements; ++i) {
+ int Base = (i * ElementBits / 128) * (128 / ElementBits);
+ Constant *COp = CV->getOperand(i);
+ if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ // Only the least significant 2 bits of the integer are used.
+ int Index = Base + (Element & 0x3);
+ ShuffleMask.push_back(Index);
+ }
+ }
+}
+
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
+ unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+ unsigned Scale = DstScalarBits / SrcScalarBits;
+ assert(SrcScalarBits < DstScalarBits &&
+ "Expected zero extension mask to increase scalar size");
+ assert(SrcVT.getVectorNumElements() >= NumDstElts &&
+ "Too many zero extension lanes");
+
+ for (unsigned i = 0; i != NumDstElts; i++) {
+ Mask.push_back(i);
+ for (unsigned j = 1; j != Scale; j++)
+ Mask.push_back(SM_SentinelZero);
+ }
+}
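For the new zero-extension decoder, a pmovzxwd-style example (illustration only, assuming an in-tree build):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void examplePMOVZX() {
    llvm::SmallVector<int, 8> Mask;
    // Widening v8i16 elements to v4i32 gives Scale == 2, so each of the
    // four source elements is followed by one zeroed slot.
    llvm::DecodeZeroExtendMask(llvm::MVT::v8i16, llvm::MVT::v4i32, Mask);
    // Mask is <0, -2, 1, -2, 2, -2, 3, -2>, where -2 is SM_SentinelZero.
  }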
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ShuffleMask.push_back(0);
+ for (unsigned i = 1; i < NumElts; i++)
+ ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+ // The first element comes from the first element of the second source.
+ // The remaining elements are zeroed for a load, or copied from the first
+ // source for a move.
+ unsigned NumElts = VT.getVectorNumElements();
+ Mask.push_back(NumElts);
+ for (unsigned i = 1; i < NumElts; i++)
+ Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
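The scalar-move decoder distinguishes the register and load forms; a small sketch (illustration only, assuming an in-tree build):

  #include "X86ShuffleDecode.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineValueType.h"

  void exampleMOVSS() {
    llvm::SmallVector<int, 4> RegForm, LoadForm;
    llvm::DecodeScalarMoveMask(llvm::MVT::v4f32, /*IsLoad=*/false, RegForm);
    // Register form: <4, 1, 2, 3>, element 0 from src2, the rest from src1.
    llvm::DecodeScalarMoveMask(llvm::MVT::v4f32, /*IsLoad=*/true, LoadForm);
    // Load form: <4, -2, -2, -2>, the upper elements are zeroed.
  }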
+} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 6ba3c64..5c9a8cf 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -1,93 +1,105 @@ -//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Define several functions to decode x86 specific shuffle semantics into a -// generic vector mask. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H -#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/ArrayRef.h" - -//===----------------------------------------------------------------------===// -// Vector Mask Decoding -//===----------------------------------------------------------------------===// - -namespace llvm { -class Constant; -class MVT; - -enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; - -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -// <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); - -// <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); - -void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); - -void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); - -void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates -/// the type of the vector allowing it to handle different datatypes and vector -/// widths. -void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd -/// and punpckh*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); - -/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// and punpckl*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. -void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); - -/// \brief Decode a PSHUFB mask from an IR-level vector constant. -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - -/// \brief Decode a PSHUFB mask from a raw array of constants such as from -/// BUILD_VECTOR. -void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, - SmallVectorImpl<int> &ShuffleMask); - -/// \brief Decode a BLEND immediate mask into a shuffle mask. 
-void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, - SmallVectorImpl<int> &ShuffleMask); - -/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. -/// No VT provided since it only works on 256-bit, 4 element vectors. -void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); - -/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - -} // llvm namespace - -#endif +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - This decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4-element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+ SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 8bd5817..8b0a4cf 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -55,9 +55,6 @@ FunctionPass *createX86IssueVZeroUpperPass(); /// FunctionPass *createEmitX86CodeToMemory(); -/// \brief Creates an X86-specific Target Transformation Info pass. -ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); - /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); @@ -67,6 +64,11 @@ FunctionPass *createX86PadShortFunctions(); /// to eliminate execution delays in some Atom processors. FunctionPass *createX86FixupLEAs(); +/// createX86CallFrameOptimization - Return a pass that optimizes +/// the code-size of x86 call sequences. This is done by replacing +/// esp-relative movs with pushes. +FunctionPass *createX86CallFrameOptimization(); + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 83f55d3..4f9836d 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -79,9 +79,16 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; +// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that +// explicit. Also, it seems this would be the default state for most chips +// going forward, so it would probably be better to negate the logic and +// match the 32-byte "slow mem" feature below. def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", "IsUAMemFast", "true", "Fast unaligned memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -125,9 +132,9 @@ def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", "Enable XOP instructions", [FeatureFMA4]>; -def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", - "HasVectorUAMem", "true", - "Allow unaligned memory operands on vector/SIMD instructions">; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; @@ -157,19 +164,18 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; -def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", - "Support SGX instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; -def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true", - "Support SMAP instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; -def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", - "HasSlowDivide", "true", - "Use small divide for positive values less than 256">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 
8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", + "HasSlowDivide64", "true", + "Use 16-bit divide for positive values less than 65536">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -230,86 +236,166 @@ def : ProcessorModel<"core2", SandyBridgeModel, def : ProcessorModel<"penryn", SandyBridgeModel, [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -// Atom. -def : ProcessorModel<"atom", AtomModel, - [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, - FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, - FeatureSlowDivide, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions]>; - -// Atom Silvermont. -def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, - FeatureSSE42, FeatureCMPXCHG16B, - FeatureMOVBE, FeaturePOPCNT, - FeaturePCLMUL, FeatureAES, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, FeatureSlowIncDec, - FeatureSlowBTMem, FeatureFastUAMem]>; +// Atom CPUs. +class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ + ProcIntelAtom, + FeatureSSSE3, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLeaForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions + ]>; +def : BonnellProc<"bonnell">; +def : BonnellProc<"atom">; // Pin the generic name to the baseline. + +class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelSLM, + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureFastUAMem + ]>; +def : SilvermontProc<"silvermont">; +def : SilvermontProc<"slm">; // Legacy alias. + // "Arrandale" along with corei3 and corei5 -def : ProcessorModel<"corei7", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>; +class NehalemProc<string Name, list<SubtargetFeature> AdditionalFeatures> + : ProcessorModel<Name, SandyBridgeModel, !listconcat([ + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureFastUAMem, + FeaturePOPCNT + ], + AdditionalFeatures)>; +def : NehalemProc<"nehalem", []>; +def : NehalemProc<"corei7", [FeatureAES]>; -def : ProcessorModel<"nehalem", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT]>; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : ProcessorModel<"westmere", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES, - FeaturePCLMUL]>; -// Sandy Bridge +class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL + ]>; +def : WestmereProc<"westmere">; + // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. 
-def : ProcessorModel<"corei7-avx", SandyBridgeModel, - [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; -// Ivy Bridge -def : ProcessorModel<"core-avx-i", SandyBridgeModel, - [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase]>; - -// Haswell -def : ProcessorModel<"core-avx2", HaswellModel, - [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, - FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE, FeatureSlowIncDec]>; - -// Broadwell -def : ProcessorModel<"broadwell", HaswellModel, - [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, - FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP, - FeatureSlowIncDec]>; -// KNL +class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureAVX, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL + ]>; +def : SandyBridgeProc<"sandybridge">; +def : SandyBridgeProc<"corei7-avx">; // Legacy alias. + +class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureAVX, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase + ]>; +def : IvyBridgeProc<"ivybridge">; +def : IvyBridgeProc<"core-avx-i">; // Legacy alias. + +class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureAVX2, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec + ]>; +def : HaswellProc<"haswell">; +def : HaswellProc<"core-avx2">; // Legacy alias. 
+ +class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureAVX2, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec + ]>; +def : BroadwellProc<"broadwell">; + // FIXME: define KNL model -def : ProcessorModel<"knl", HaswellModel, +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, FeatureSlowIncDec]>; +def : KnightsLandingProc<"knl">; -// SKX // FIXME: define SKX model -def : ProcessorModel<"skx", HaswellModel, +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [FeatureAVX512, FeatureCDI, FeatureDQI, FeatureBWI, FeatureVLX, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureSGX]>; + FeatureSlowIncDec]>; +def : SkylakeProc<"skylake">; +def : SkylakeProc<"skx">; // Legacy alias. + + +// AMD CPUs. def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -318,7 +404,7 @@ def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; @@ -342,6 +428,10 @@ def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"barcelona", [FeatureSSE4A, + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem, + FeatureSlowSHLD]>; // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, @@ -352,8 +442,10 @@ def : ProcessorModel<"btver2", BtVer2Model, [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, - FeatureUseSqrtEst, FeatureUseRecipEst]>; + FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, + FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>; + +// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, @@ -394,7 +486,7 @@ def : Proc<"c3-2", [FeatureSSE1]>; // be good for modern chips without enabling instruction set encodings past the // basic SSE2 and 64-bit ones. It disables slow things from any mainstream and // modern 64-bit x86 chip, and enables features that are generally beneficial. -// +// // We currently use the Sandy Bridge model as the default scheduling model as // we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which // covers a huge swath of x86 processors. 
If there are specific scheduling diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 4e5b7b8..bb0b9ce 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -47,6 +47,8 @@ using namespace llvm; /// runOnMachineFunction - Emit the function body. /// bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<X86Subtarget>(); + SMShadowTracker.startFunction(MF); SetupMachineFunction(MF); @@ -505,13 +507,15 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { - if (Subtarget->isTargetMacho()) + Triple TT(TM.getTargetTriple()); + + if (TT.isOSBinFormatMachO()) OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); - if (Subtarget->isTargetCOFF()) { + if (TT.isOSBinFormatCOFF()) { // Emit an absolute @feat.00 symbol. This appears to be some kind of // compiler features bitfield read by link.exe. - if (!Subtarget->is64Bit()) { + if (TT.getArch() == Triple::x86) { MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00")); OutStreamer.BeginCOFFSymbolDef(S); OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); @@ -558,8 +562,7 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = - CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout()); + SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( getObjFileLowering().getSectionForConstant(Kind, C))) { @@ -579,20 +582,21 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { SmallString<128> Directive; raw_svector_ostream OS(Directive); StringRef Name = Sym->getName(); + Triple TT(TM.getTargetTriple()); - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) OS << " /EXPORT:"; else OS << " -export:"; - if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) && + if ((TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) && (Name[0] == getDataLayout().getGlobalPrefix())) Name = Name.drop_front(); OS << Name; if (IsData) { - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) OS << ",DATA"; else OS << ",data"; @@ -603,10 +607,12 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { } void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMacho()) { + Triple TT(TM.getTargetTriple()); + + if (TT.isOSBinFormatMachO()) { // All darwin targets use mach-o. MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MMI->getObjFileInfo<MachineModuleInfoMachO>(); // Output stubs for dynamically-linked functions. MachineModuleInfoMachO::SymbolListTy Stubs; @@ -677,22 +683,23 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } - if (Subtarget->isTargetKnownWindowsMSVC() && MMI->usesVAFloatArgument()) { - StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused"; + if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { + StringRef SymbolName = + (TT.getArch() == Triple::x86_64) ? 
"_fltused" : "__fltused"; MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); OutStreamer.EmitSymbolAttribute(S, MCSA_Global); } - if (Subtarget->isTargetCOFF()) { + if (TT.isOSBinFormatCOFF()) { // Necessary for dllexport support std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; for (const auto &Function : M) - if (Function.hasDLLExportStorageClass()) + if (Function.hasDLLExportStorageClass() && !Function.isDeclaration()) DLLExportedFns.push_back(getSymbol(&Function)); for (const auto &Global : M.globals()) - if (Global.hasDLLExportStorageClass()) + if (Global.hasDLLExportStorageClass() && !Global.isDeclaration()) DLLExportedGlobals.push_back(getSymbol(&Global)); for (const auto &Alias : M.aliases()) { @@ -719,7 +726,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } - if (Subtarget->isTargetELF()) { + if (TT.isOSBinFormatELF()) { const TargetLoweringObjectFileELF &TLOFELF = static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); @@ -729,7 +736,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); + const DataLayout *TD = TM.getDataLayout(); for (const auto &Stub : Stubs) { OutStreamer.EmitLabel(Stub.first); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 748b948..d101b8c 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -57,6 +57,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); private: TargetMachine &TM; + const MachineFunction *MF; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool InShadow; @@ -85,10 +86,9 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); public: - explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); - } + explicit X86AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this), SMShadowTracker(TM) {} const char *getPassName() const override { return "X86 Assembly / Object Emitter"; diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp new file mode 100644 index 0000000..5e8d374 --- /dev/null +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -0,0 +1,480 @@ +//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pass that optimizes call sequences on x86. +// Currently, it converts movs of function parameters onto the stack into +// pushes. This is beneficial for two main reasons: +// 1) The push instruction encoding is much smaller than an esp-relative mov +// 2) It is possible to push memory arguments directly. So, if the +// the transformation is preformed pre-reg-alloc, it can help relieve +// register pressure. 
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-cf-opt" + +static cl::opt<bool> + NoX86CFOpt("no-x86-call-frame-opt", + cl::desc("Avoid optimizing x86 call frames for size"), + cl::init(false), cl::Hidden); + +namespace { +class X86CallFrameOptimization : public MachineFunctionPass { +public: + X86CallFrameOptimization() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + // Information we know about a particular call site + struct CallContext { + CallContext() + : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; + + // Actuall call instruction + MachineInstr *Call; + + // A copy of the stack pointer + MachineInstr *SPCopy; + + // The total displacement of all passed parameters + int64_t ExpectedDist; + + // The sequence of movs used to pass the parameters + SmallVector<MachineInstr *, 4> MovVector; + + // True if this call site has no stack parameters + bool NoStackParams; + + // True of this callsite can use push instructions + bool UsePush; + }; + + typedef DenseMap<MachineInstr *, CallContext> ContextMap; + + bool isLegal(MachineFunction &MF); + + bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + + void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, CallContext &Context); + + bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, + const CallContext &Context); + + MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, + unsigned Reg); + + const char *getPassName() const override { return "X86 Optimize Call Frame"; } + + const TargetInstrInfo *TII; + const TargetFrameLowering *TFL; + const MachineRegisterInfo *MRI; + static char ID; +}; + +char X86CallFrameOptimization::ID = 0; +} + +FunctionPass *llvm::createX86CallFrameOptimization() { + return new X86CallFrameOptimization(); +} + +// This checks whether the transformation is legal. +// Also returns false in cases where it's potentially legal, but +// we don't even want to try. +bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { + if (NoX86CFOpt.getValue()) + return false; + + // We currently only support call sequences where *all* parameters. + // are passed on the stack. + // No point in running this in 64-bit mode, since some arguments are + // passed in-register in all common calling conventions, so the pattern + // we're looking for will never match. + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.is64Bit()) + return false; + + // You would expect straight-line code between call-frame setup and + // call-frame destroy. You would be wrong. There are circumstances (e.g. + // CMOV_GR8 expansion of a select that feeds a function call!) where we can + // end up with the setup and the destroy in different basic blocks. + // This is bad, and breaks SP adjustment. 
+ // So, check that all of the frames in the function are closed inside + // the same block, and, for good measure, that there are no nested frames. + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + for (MachineBasicBlock &BB : MF) { + bool InsideFrameSequence = false; + for (MachineInstr &MI : BB) { + if (MI.getOpcode() == FrameSetupOpcode) { + if (InsideFrameSequence) + return false; + InsideFrameSequence = true; + } else if (MI.getOpcode() == FrameDestroyOpcode) { + if (!InsideFrameSequence) + return false; + InsideFrameSequence = false; + } + } + + if (InsideFrameSequence) + return false; + } + + return true; +} + +// Check whether this trasnformation is profitable for a particular +// function - in terms of code size. +bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, + ContextMap &CallSeqMap) { + // This transformation is always a win when we do not expect to have + // a reserved call frame. Under other circumstances, it may be either + // a win or a loss, and requires a heuristic. + bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); + if (CannotReserveFrame) + return true; + + // Don't do this when not optimizing for size. + bool OptForSize = + MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || + MF.getFunction()->hasFnAttribute(Attribute::MinSize); + + if (!OptForSize) + return false; + + + unsigned StackAlign = TFL->getStackAlignment(); + + int64_t Advantage = 0; + for (auto CC : CallSeqMap) { + // Call sites where no parameters are passed on the stack + // do not affect the cost, since there needs to be no + // stack adjustment. + if (CC.second.NoStackParams) + continue; + + if (!CC.second.UsePush) { + // If we don't use pushes for a particular call site, + // we pay for not having a reserved call frame with an + // additional sub/add esp pair. The cost is ~3 bytes per instruction, + // depending on the size of the constant. + // TODO: Callee-pop functions should have a smaller penalty, because + // an add is needed even with a reserved call frame. + Advantage -= 6; + } else { + // We can use pushes. First, account for the fixed costs. + // We'll need a add after the call. + Advantage -= 3; + // If we have to realign the stack, we'll also need and sub before + if (CC.second.ExpectedDist % StackAlign) + Advantage -= 3; + // Now, for each push, we save ~3 bytes. For small constants, we actually, + // save more (up to 5 bytes), but 3 should be a good approximation. 
+ Advantage += (CC.second.ExpectedDist / 4) * 3; + } + } + + return (Advantage >= 0); +} + + +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget().getInstrInfo(); + TFL = MF.getSubtarget().getFrameLowering(); + MRI = &MF.getRegInfo(); + + if (!isLegal(MF)) + return false; + + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + + bool Changed = false; + + ContextMap CallSeqMap; + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode) { + CallContext &Context = CallSeqMap[I]; + collectCallInfo(MF, *BB, I, Context); + } + + if (!isProfitable(MF, CallSeqMap)) + return false; + + for (auto CC : CallSeqMap) + if (CC.second.UsePush) + Changed |= adjustCallSequence(MF, CC.first, CC.second); + + return Changed; +} + +void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + CallContext &Context) { + // Check that this particular call sequence is amenable to the + // transformation. + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + unsigned StackPtr = RegInfo.getStackRegister(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // We expect to enter this at the beginning of a call sequence + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + MachineBasicBlock::iterator FrameSetup = I++; + + // How much do we adjust the stack? This puts an upper bound on + // the number of parameters actually passed on it. + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + + // A zero adjustment means no stack parameters + if (!MaxAdjust) { + Context.NoStackParams = true; + return; + } + + // For globals in PIC mode, we can have some LEAs here. + // Ignore them, they don't bother us. + // TODO: Extend this to something that covers more cases. + while (I->getOpcode() == X86::LEA32r) + ++I; + + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return; + Context.SPCopy = I++; + StackPtr = Context.SPCopy->getOperand(0).getReg(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps between them. + if (MaxAdjust > 4) + Context.MovVector.resize(MaxAdjust, nullptr); + + do { + int Opcode = I->getOpcode(); + if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) + break; + + // We only want movs of the form: + // movl imm/r32, k(%esp) + // If we run into something else, bail. + // Note that AddrBaseReg may, counter to its name, not be a register, + // but rather a frame index. + // TODO: Support the fi case. This should probably work now that we + // have the infrastructure to track the stack pointer within a call + // sequence. 
+ if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + assert(StackDisp >= 0 && + "Negative stack displacement when passing parameters"); + + // We really don't want to consider the unaligned case. + if (StackDisp % 4) + return; + StackDisp /= 4; + + assert((size_t)StackDisp < Context.MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); + + // If the same stack slot is being filled twice, something's fishy. + if (Context.MovVector[StackDisp] != nullptr) + return; + Context.MovVector[StackDisp] = I; + + ++I; + } while (I != MBB.end()); + + // We now expect the end of the sequence - a call and a stack adjust. + if (I == MBB.end()) + return; + + // For PCrel calls, we expect an additional COPY of the basereg. + // If we find one, skip it. + if (I->isCopy()) { + if (I->getOperand(1).getReg() == + MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) + ++I; + else + return; + } + + if (!I->isCall()) + return; + + Context.Call = I; + if ((++I)->getOpcode() != FrameDestroyOpcode) + return; + + // Now, go through the vector, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end(); + for (; MMI != MME; ++MMI, Context.ExpectedDist += 4) + if (*MMI == nullptr) + break; + + // If the call had no parameters, do nothing + if (MMI == Context.MovVector.begin()) + return; + + // We are either at the last parameter, or a gap. + // Make sure it's not a gap + for (; MMI != MME; ++MMI) + if (*MMI != nullptr) + return; + + Context.UsePush = true; + return; +} + +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + MachineBasicBlock::iterator I, + const CallContext &Context) { + // Ok, we can in fact do the transformation for this call. + // Do not remove the FrameSetup instruction, but adjust the parameters. + // PEI will end up finalizing the handling of this. + MachineBasicBlock::iterator FrameSetup = I; + MachineBasicBlock &MBB = *(I->getParent()); + FrameSetup->getOperand(1).setImm(Context.ExpectedDist); + + DebugLoc DL = I->getDebugLoc(); + // Now, iterate through the vector in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // replace uses. + for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + if (MOV->getOpcode() == X86::MOV32mi) { + unsigned PushOpcode = X86::PUSHi32; + // If the operand is a small (8-bit) immediate, we can use a + // PUSH instruction with a shorter encoding. + // Note that isImm() may fail even though this is a MOVmi, because + // the operand can also be a symbol. + if (PushOp.isImm()) { + int64_t Val = PushOp.getImm(); + if (isInt<8>(Val)) + PushOpcode = X86::PUSH32i8; + } + BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + } else { + unsigned int Reg = PushOp.getReg(); + + // If PUSHrmm is not slow on this target, try to fold the source of the + // push into the instruction. 
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + + // Check that this is legal to fold. Right now, we're extremely + // conservative about that. + MachineInstr *DefMov = nullptr; + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { + MachineInstr *Push = + BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + + unsigned NumOps = DefMov->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + Push->addOperand(DefMov->getOperand(i)); + + DefMov->eraseFromParent(); + } else { + BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + .addReg(Reg) + .getInstr(); + } + } + + MBB.erase(MOV); + } + + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) + Context.SPCopy->eraseFromParent(); + + // Once we've done this, we need to make sure PEI doesn't assume a reserved + // frame. + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + FuncInfo->setHasPushSequences(true); + + return true; +} + +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( + MachineBasicBlock::iterator FrameSetup, unsigned Reg) { + // Do an extremely restricted form of load folding. + // ISel will often create patterns like: + // movl 4(%edi), %eax + // movl 8(%edi), %ecx + // movl 12(%edi), %edx + // movl %edx, 8(%esp) + // movl %ecx, 4(%esp) + // movl %eax, (%esp) + // call + // Get rid of those with prejudice. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + // Make sure this is the only use of Reg. + if (!MRI->hasOneNonDBGUse(Reg)) + return nullptr; + + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); + + // Make sure the def is a MOV from memory. + // If the def is an another block, give up. + if (DefMI->getOpcode() != X86::MOV32rm || + DefMI->getParent() != FrameSetup->getParent()) + return nullptr; + + // Now, make sure everything else up until the ADJCALLSTACK is a sequence + // of MOVs. To be less conservative would require duplicating a lot of the + // logic from PeepholeOptimizer. + // FIXME: A possibly better approach would be to teach the PeepholeOptimizer + // to be smarter about folding into pushes. + for (auto I = DefMI; I != FrameSetup; ++I) + if (I->getOpcode() != X86::MOV32rm) + return nullptr; + + return DefMI; +} diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 75a2ec0..41c759a 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[ CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + // The first 4 AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, @@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToStack<32, 32>>, + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>>, + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. 
They are // passed in the parameter area. CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>; @@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[v16f32, v8f64, v16i32, v8i64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + // Pass masks in mask registers + CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, CCDelegateTo<CC_X86_32_C> diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 95cb718..a17f052 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -58,8 +59,8 @@ class X86FastISel final : public FastISel { public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); + : FastISel(funcInfo, libInfo) { + Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); } @@ -80,7 +81,7 @@ public: #include "X86GenFastISel.inc" private: - bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg); @@ -123,11 +124,15 @@ private: bool X86SelectTrunc(const Instruction *I); + bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc, + const TargetRegisterClass *RC); + bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); + bool X86SelectSIToFP(const Instruction *I); const X86InstrInfo *getInstrInfo() const { - return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); + return Subtarget->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast<const X86TargetMachine *>(&TM); @@ -137,7 +142,7 @@ private: unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned X86MaterializeGV(const GlobalValue *GV,MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); unsigned fastMaterializeConstant(const Constant *C) override; unsigned fastMaterializeAlloca(const AllocaInst *C) override; @@ -544,7 +549,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Ok, we need to do a load from a stub. If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); unsigned LoadReg; if (I != LocalValueMap.end() && I->second != 0) { LoadReg = I->second; @@ -655,7 +660,7 @@ redo_gep: case Instruction::Alloca: { // Do static allocas. 
const AllocaInst *A = cast<AllocaInst>(V); - DenseMap<const AllocaInst*, int>::iterator SI = + DenseMap<const AllocaInst *, int>::iterator SI = FuncInfo.StaticAllocaMap.find(A); if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; @@ -903,7 +908,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { unsigned Alignment = S->getAlignment(); unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); - if (Alignment == 0) // Ensure that codegen never sees alignment 0 + if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = ABIAlignment; bool Aligned = Alignment >= ABIAlignment; @@ -1009,12 +1014,12 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Make the copy. unsigned DstReg = VA.getLocReg(); - const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - DstReg).addReg(SrcReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); // Add register to return instruction. RetRegs.push_back(VA.getLocReg()); @@ -1030,14 +1035,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - RetReg).addReg(Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); RetRegs.push_back(RetReg); } // Now emit the RET. MachineInstrBuilder MIB = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) MIB.addReg(RetRegs[i], RegState::Implicit); return true; @@ -1108,7 +1114,7 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { } bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, - EVT VT) { + EVT VT, DebugLoc CurDbgLoc) { unsigned Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; @@ -1121,7 +1127,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, // CMPri, otherwise use CMPrr. 
if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) .addReg(Op0Reg) .addImm(Op1C->getSExtValue()); return true; @@ -1133,7 +1139,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) .addReg(Op0Reg) .addReg(Op1Reg); @@ -1201,7 +1207,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { ResultReg = createResultReg(&X86::GR8RegClass); if (SETFOpc) { - if (!X86FastEmitCompare(LHS, RHS, VT)) + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); @@ -1226,7 +1232,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { std::swap(LHS, RHS); // Emit a compare of LHS/RHS. - if (!X86FastEmitCompare(LHS, RHS, VT)) + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -1284,7 +1290,6 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { return true; } - bool X86FastISel::X86SelectBranch(const Instruction *I) { // Unconditional branches are selected by tablegen-generated code. // Handle a conditional branch. @@ -1353,7 +1358,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) @@ -1362,7 +1367,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // X86 requires a second branch to handle UNE (and OEQ, which is mapped // to UNE above). if (NeedExtraBranch) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) .addMBB(TrueMBB); } @@ -1399,10 +1404,10 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); - unsigned JmpOpc = X86::JNE_4; + unsigned JmpOpc = X86::JNE_1; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); - JmpOpc = X86::JE_4; + JmpOpc = X86::JE_1; } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) @@ -1444,7 +1449,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg).addImm(1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; @@ -1632,8 +1637,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { TII.get(X86::MOV32r0), Zero32); // Copy the zero into the appropriate sub/super/identical physical - // register. Unfortunately the operations needed are not uniform enough to - // fit neatly into the table above. + // register. Unfortunately the operations needed are not uniform enough + // to fit neatly into the table above. 
if (VT.SimpleTy == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) @@ -1740,8 +1745,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { EVT CmpVT = TLI.getValueType(CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) - return false; + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); @@ -1820,7 +1825,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (I->getType() != CI->getOperand(0)->getType() || !((Subtarget->hasSSE1() && RetVT == MVT::f32) || - (Subtarget->hasSSE2() && RetVT == MVT::f64) )) + (Subtarget->hasSSE2() && RetVT == MVT::f64))) return false; const Value *CmpLHS = CI->getOperand(0); @@ -1924,7 +1929,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(CmpLHS->getType()); - if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { unsigned CondReg = getRegForValue(Cond); @@ -2001,41 +2006,91 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { return false; } +bool X86FastISel::X86SelectSIToFP(const Instruction *I) { + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + + // Select integer to float/double conversion. + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + bool HasAVX = Subtarget->hasAVX(); + const TargetRegisterClass *RC = nullptr; + unsigned Opcode; + + if (I->getType()->isDoubleTy() && X86ScalarSSEf64) { + // sitofp int -> double + Opcode = HasAVX ? X86::VCVTSI2SDrr : X86::CVTSI2SDrr; + RC = &X86::FR64RegClass; + } else if (I->getType()->isFloatTy() && X86ScalarSSEf32) { + // sitofp int -> float + Opcode = HasAVX ? X86::VCVTSI2SSrr : X86::CVTSI2SSrr; + RC = &X86::FR32RegClass; + } else + return false; + + + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + const MCInstrDesc &II = TII.get(Opcode); + OpReg = constrainOperandRegClass(II, OpReg, (HasAVX ? 2 : 1)); + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg); + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg, RegState::Kill); + MIB.addReg(OpReg); + updateValueMap(I, ResultReg); + return true; +} + +// Helper method used by X86SelectFPExt and X86SelectFPTrunc. +bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, + unsigned TargetOpc, + const TargetRegisterClass *RC) { + assert((I->getOpcode() == Instruction::FPExt || + I->getOpcode() == Instruction::FPTrunc) && + "Instruction must be an FPExt or FPTrunc!"); + + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), + ResultReg); + if (Subtarget->hasAVX()) + MIB.addReg(OpReg); + MIB.addReg(OpReg); + updateValueMap(I, ResultReg); + return true; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { - // fpext from float to double. 
- if (X86ScalarSSEf64 && - I->getType()->isDoubleTy()) { - const Value *V = I->getOperand(0); - if (V->getType()->isFloatTy()) { - unsigned OpReg = getRegForValue(V); - if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(&X86::FR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::CVTSS2SDrr), ResultReg) - .addReg(OpReg); - updateValueMap(I, ResultReg); - return true; - } + if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + I->getOperand(0)->getType()->isFloatTy()) { + // fpext from float to double. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass); } return false; } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { - if (X86ScalarSSEf64) { - if (I->getType()->isFloatTy()) { - const Value *V = I->getOperand(0); - if (V->getType()->isDoubleTy()) { - unsigned OpReg = getRegForValue(V); - if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(&X86::FR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::CVTSD2SSrr), ResultReg) - .addReg(OpReg); - updateValueMap(I, ResultReg); - return true; - } - } + if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + I->getOperand(0)->getType()->isDoubleTy()) { + // fptrunc from double to float. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass); } return false; @@ -2065,12 +2120,11 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? - (const TargetRegisterClass*)&X86::GR16_ABCDRegClass : - (const TargetRegisterClass*)&X86::GR32_ABCDRegClass; + const TargetRegisterClass *CopyRC = + (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - CopyReg).addReg(InputReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); InputReg = CopyReg; } @@ -2107,9 +2161,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, VT = MVT::i32; else if (Len >= 2) VT = MVT::i16; - else { + else VT = MVT::i8; - } unsigned Reg; bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); @@ -2129,7 +2182,73 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. switch (II->getIntrinsicID()) { default: return false; + case Intrinsic::convert_from_fp16: + case Intrinsic::convert_to_fp16: { + if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) + return false; + + const Value *Op = II->getArgOperand(0); + unsigned InputReg = getRegForValue(Op); + if (InputReg == 0) + return false; + + // F16C only allows converting from float to half and from half to float. + bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; + if (IsFloatToHalf) { + if (!Op->getType()->isFloatTy()) + return false; + } else { + if (!II->getType()->isFloatTy()) + return false; + } + + unsigned ResultReg = 0; + const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); + if (IsFloatToHalf) { + // 'InputReg' is implicitly promoted from register class FR32 to + // register class VR128 by method 'constrainOperandRegClass' which is + // directly called by 'fastEmitInst_ri'. 
+ // Instruction VCVTPS2PHrr takes an extra immediate operand which is + // used to provide rounding control. + InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0); + + // Move the lower 32-bits of ResultReg to another register of class GR32. + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::VMOVPDI2DIrr), ResultReg) + .addReg(InputReg, RegState::Kill); + + // The result value is in the lower 16-bits of ResultReg. + unsigned RegIdx = X86::sub_16bit; + ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); + } else { + assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); + // Explicitly sign-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + /*Kill=*/false); + + // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. + InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, + InputReg, /*Kill=*/true); + + InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + + // The result value is in the lower 32-bits of ResultReg. + // Emit an explicit copy from register class VR128 to register class FR32. + ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(InputReg, RegState::Kill); + } + + updateValueMap(II, ResultReg); + return true; + } case Intrinsic::frameaddress: { + MachineFunction *MF = FuncInfo.MF; + if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; @@ -2145,14 +2264,13 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; } - // This needs to be set before we call getFrameRegister, otherwise we get - // the wrong frame register. - MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + // This needs to be set before we call getPtrSizedFrameRegister, otherwise + // we get the wrong frame register. + MachineFrameInfo *MFI = MF->getFrameInfo(); MFI->setFrameAddressIsTaken(true); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); - unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -2372,19 +2490,16 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned ResultReg = 0; // Check if we have an immediate version. 
if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { - static const unsigned Opc[2][2][4] = { - { { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, - { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } }, - { { X86::INC8r, X86::INC64_16r, X86::INC64_32r, X86::INC64r }, - { X86::DEC8r, X86::DEC64_16r, X86::DEC64_32r, X86::DEC64r } } + static const unsigned Opc[2][4] = { + { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } }; if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { ResultReg = createResultReg(TLI.getRegClassFor(VT)); - bool Is64Bit = Subtarget->is64Bit(); bool IsDec = BaseOpc == X86ISD::DEC; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc[Is64Bit][IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)); } else ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, @@ -2529,7 +2644,7 @@ bool X86FastISel::fastLowerArguments() { if (!Subtarget->is64Bit()) return false; - + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. unsigned GPRCnt = 0; unsigned FPRCnt = 0; @@ -2674,6 +2789,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { TM.Options.GuaranteedTailCallOpt)) return false; + SmallVector<MVT, 16> OutVTs; + SmallVector<unsigned, 16> ArgRegs; + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra // instruction. This is safe because it is common to all FastISel supported // calling conventions on x86. @@ -2691,28 +2809,34 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". - if (auto *TI = dyn_cast<TruncInst>(Val)) { - if (TI->getType()->isIntegerTy(1) && CLI.CS && - (TI->getParent() == CLI.CS->getInstruction()->getParent()) && - TI->hasOneUse()) { - Val = cast<TruncInst>(Val)->getOperand(0); - unsigned ResultReg = getRegForValue(Val); - - if (!ResultReg) - return false; - - MVT ArgVT; - if (!isTypeLegal(Val->getType(), ArgVT)) - return false; + MVT VT; + auto *TI = dyn_cast<TruncInst>(Val); + unsigned ResultReg; + if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Value *PrevVal = TI->getOperand(0); + ResultReg = getRegForValue(PrevVal); + + if (!ResultReg) + return false; - ResultReg = - fastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1); + if (!isTypeLegal(PrevVal->getType(), VT)) + return false; - if (!ResultReg) - return false; - updateValueMap(Val, ResultReg); - } + ResultReg = + fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); + } else { + if (!isTypeLegal(Val->getType(), VT)) + return false; + ResultReg = getRegForValue(Val); } + + if (!ResultReg) + return false; + + ArgRegs.push_back(ResultReg); + OutVTs.push_back(VT); } // Analyze operands of the call, assigning locations to each operand. @@ -2723,13 +2847,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsWin64) CCInfo.AllocateStack(32, 8); - SmallVector<MVT, 16> OutVTs; - for (auto *Val : OutVals) { - MVT VT; - if (!isTypeLegal(Val->getType(), VT)) - return false; - OutVTs.push_back(VT); - } CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. 
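A minimal annotated sketch of the two-immediate call-frame-setup form that the next hunk introduces in fastLowerCall. The BuildMI call mirrors the patch; the interpretation of the second immediate is inferred from X86CallFrameOptimization::adjustCallSequence earlier in this diff, which later overwrites it with Context.ExpectedDist once argument stores have been turned into pushes.

  // Sketch only (annotation of the form used in the following hunk).
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
      .addImm(NumBytes)   // total byte size of the outgoing-argument area
      .addImm(0);         // bytes the call sequence handles itself via pushes;
                          // starts at 0, later set to Context.ExpectedDist by
                          // X86CallFrameOptimization::adjustCallSequence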
@@ -2738,11 +2855,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign const &VA = ArgLocs[i]; const Value *ArgVal = OutVals[VA.getValNo()]; @@ -2751,9 +2867,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (ArgVT == MVT::x86mmx) return false; - unsigned ArgReg = getRegForValue(ArgVal); - if (!ArgReg) - return false; + unsigned ArgReg = ArgRegs[VA.getValNo()]; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -2875,7 +2989,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), @@ -3049,6 +3163,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); + case Instruction::SIToFP: + return X86SelectSIToFP(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); @@ -3194,8 +3310,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align); + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + TM.getDataLayout()->getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } @@ -3229,7 +3345,10 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { ResultReg) .addGlobalAddress(GV); } else { - unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } @@ -3271,7 +3390,10 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? 
X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3325,7 +3447,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!X86SelectAddress(Ptr, AM)) return false; - const X86InstrInfo &XII = (const X86InstrInfo&)TII; + const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 02736ac..b39c5ab 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -88,7 +88,6 @@ public: private: MachineFunction *MF; - const TargetMachine *TM; const X86InstrInfo *TII; // Machine instruction info. }; char FixupLEAPass::ID = 0; @@ -150,13 +149,11 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; - TM = &Func.getTarget(); - const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = - static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo()); + TII = ST.getInstrInfo(); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -219,7 +216,7 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return CurInst; } InstrDistance += TII->getInstrLatency( - TM->getSubtargetImpl()->getInstrItineraryData(), CurInst); + MF->getSubtarget().getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } return nullptr; @@ -283,6 +280,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { + default: llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -296,8 +294,6 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, addrr_opcode = X86::ADD64rr; addri_opcode = X86::ADD64ri32; break; - default: - assert(false && "Unexpected LEA instruction"); } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); @@ -334,7 +330,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (TM->getSubtarget<X86Subtarget>().isSLM()) + if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); else processInstruction(I, MFI); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 6189109..c8e5f64 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -898,7 +898,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Now we should have the correct registers live. 
DEBUG(dumpStack()); - assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch"); + assert(StackTop == countPopulation(Mask) && "Live count mismatch"); } /// shuffleStackTop - emit fxch instructions before I to shuffle the top @@ -943,7 +943,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { } } - unsigned N = CountTrailingOnes_32(STReturns); + unsigned N = countTrailingOnes(STReturns); // FP registers used for function return must be consecutive starting at // FP0. @@ -1420,14 +1420,14 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { if (STUses && !isMask_32(STUses)) MI->emitError("fixed input regs must be last on the x87 stack"); - unsigned NumSTUses = CountTrailingOnes_32(STUses); + unsigned NumSTUses = countTrailingOnes(STUses); // Defs must be contiguous from the stack top. ST0-STn. if (STDefs && !isMask_32(STDefs)) { MI->emitError("output regs must be last on the x87 stack"); STDefs = NextPowerOf2(STDefs) - 1; } - unsigned NumSTDefs = CountTrailingOnes_32(STDefs); + unsigned NumSTDefs = countTrailingOnes(STDefs); // So must the clobbered stack slots. ST0-STm, m >= n. if (STClobbers && !isMask_32(STDefs | STClobbers)) @@ -1437,7 +1437,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { unsigned STPopped = STUses & (STDefs | STClobbers); if (STPopped && !isMask_32(STPopped)) MI->emitError("implicitly popped regs must be last on the x87 stack"); - unsigned NumSTPopped = CountTrailingOnes_32(STPopped); + unsigned NumSTPopped = countTrailingOnes(STPopped); DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops " << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n"); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index b9920b1..cead099 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -38,7 +38,34 @@ using namespace llvm; extern cl::opt<bool> ForceStackAlign; bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *> + (MF.getSubtarget().getRegisterInfo()); + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) + || TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. 
+bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -82,6 +109,14 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) @@ -155,6 +190,18 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, return 0; } +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. @@ -177,7 +224,33 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { - uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; + if (Offset > Chunk) { + // Rather than emit a long series of instructions for large offsets, + // load the offset into a register and do one sub/add + unsigned Reg = 0; + + if (isSub && !isEAXLiveIn(*MBB.getParent())) + Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); + else + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + + if (Reg) { + Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; + BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) + .addImm(Offset); + Opc = isSub + ? getSUBrrOpcode(Is64BitTarget) + : getADDrrOpcode(Is64BitTarget); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addReg(Reg); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset = 0; + continue; + } + } + + uint64_t ThisVal = std::min(Offset, Chunk); if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub @@ -239,38 +312,6 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower -/// iterator. -static -void mergeSPUpdatesDown(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - // FIXME: THIS ISN'T RUN!!! - return; - - if (MBBI == MBB.end()) return; - - MachineBasicBlock::iterator NI = std::next(MBBI); - if (NI == MBB.end()) return; - - unsigned Opc = NI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } -} - /// mergeSPUpdates - Checks the instruction before/after the passed /// instruction. 
If it is an ADD/SUB/LEA instruction it is deleted argument and /// the stack adjustment is returned as a positive value for ADD/LEA and a @@ -306,19 +347,6 @@ static int mergeSPUpdates(MachineBasicBlock &MBB, return Offset; } -static bool isEAXLiveIn(MachineFunction &MF) { - for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), - EE = MF.getRegInfo().livein_end(); II != EE; ++II) { - unsigned Reg = II->first; - - if (Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL) - return true; - } - - return false; -} - void X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -365,12 +393,23 @@ static bool usesTheStack(const MachineFunction &MF) { return false; } -void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, - unsigned &CallOp, - const char *&Symbol) { - CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32; +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + bool Is64Bit = STI.is64Bit(); + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; - if (STI.is64Bit()) { + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; + + const char *Symbol; + if (Is64Bit) { if (STI.isTargetCygMing()) { Symbol = "___chkstk_ms"; } else { @@ -380,6 +419,66 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, Symbol = "_alloca"; else Symbol = "_chkstk"; + + MachineInstrBuilder CI; + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } +} + +static unsigned calculateSetFPREG(uint64_t SPAdjust) { + // Win64 ABI has a less restrictive limitation of 240; 128 works equally well + // and might require smaller successive adjustments. + const uint64_t Win64MaxSEHOffset = 128; + uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); + // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. + return SEHFrameOffset & -16; +} + +// If we're forcing a stack realignment we can't rely on just the frame +// info, we need to know the ABI stack alignment as well in case we +// have a call out. 
Otherwise just make sure we have some alignment - we'll +// go with the minimum SlotSize. +static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned StackAlign = STI.getFrameLowering()->getStackAlignment(); + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + return MaxAlign; } /// emitPrologue - Push callee-saved registers onto the stack, which @@ -448,6 +547,8 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, [if needs base pointer] mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) ; Emit CFI info [if needs FP] @@ -469,67 +570,65 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); bool IsWin64 = STI.isTargetWin64(); // Not necessarily synonymous with IsWin64. - bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == - ExceptionHandling::ItaniumWinEH; + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); - const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? - getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() + ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum SlotSize. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; - else if (MaxAlign < SlotSize) - MaxAlign = SlotSize; - } - // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta && IsWinEH) + report_fatal_error("Can't handle guaranteed tail call under win64 yet"); + if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho()); - + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoRedZone) && + if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -570,14 +669,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers are pushed on stack before the stack - // is realigned. - FrameSize -= X86FI->getCalleeSavedFrameSize(); - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); - } + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Callee-saved registers are pushed on stack before the stack is realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. @@ -613,11 +713,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } - // Update EBP with the new base value. - BuildMI(MBB, MBBI, DL, - TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) - .addReg(StackPtr) - .setMIFlag(MachineInstr::FrameSetup); + if (!IsWinEH) { + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr), + FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. @@ -666,15 +769,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). - if (RegInfo->needsStackRealignment(MF)) { + // Don't do this for Win64, it needs to realign the stack after the prologue. + if (!IsWinEH && RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); uint64_t Val = -MaxAlign; MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -685,14 +789,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); - // If there is an ADD32ri or SUB32ri of ESP immediately after this - // instruction, merge the two instructions. - mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); - // Adjust stack pointer: ESP -= numbytes. - static const size_t PageSize = 4096; - // Windows and cygwin/mingw require a prologue helper routine when allocating // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the @@ -701,12 +799,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= PageSize && UseStackProbe) { - const char *StackProbeSymbol; - unsigned CallOp; - - getStackProbeFunction(STI, CallOp, StackProbeSymbol); - + uint64_t AlignedNumBytes = NumBytes; + if (IsWinEH && RegInfo->needsStackRealignment(MF)) + AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); + if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -724,9 +820,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + if (isUInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. 
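The prologue hunk above chooses among three move encodings when loading the allocation size into the register consumed by the stack-probe call. A minimal standalone sketch of that choice follows; pickStackProbeMovOpcode is a hypothetical helper name, not something the patch defines.

  #include "llvm/Support/MathExtras.h"   // llvm::isUInt / llvm::isInt

  // Prefer the shortest move that can materialize NumBytes into EAX/RAX
  // before the __chkstk / ___chkstk_ms / __alloca probe call.
  static unsigned pickStackProbeMovOpcode(uint64_t NumBytes) {
    if (llvm::isUInt<32>(NumBytes))
      return X86::MOV32ri;    // 32-bit immediate, zero-extends into RAX
    if (llvm::isInt<32>(NumBytes))
      return X86::MOV64ri32;  // sign-extended 32-bit immediate
    return X86::MOV64ri;      // full 64-bit immediate for very large frames
  }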
@@ -735,22 +841,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, - TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); + // Save a pointer to the MI where we set AX. + MachineBasicBlock::iterator SetRAX = MBBI; + --SetRAX; + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbeCall(MF, MBB, MBBI, DL); + + // Apply the frame setup flag to all inserted instrs. + for (; SetRAX != MBBI; ++SetRAX) + SetRAX->setFlag(MachineInstr::FrameSetup); - if (Is64Bit) { - // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - // themself. It also does not clobber %rax so we can reuse it when - // adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) - .addReg(StackPtr) - .addReg(X86::RAX) - .setMIFlag(MachineInstr::FrameSetup); - } if (isEAXAlive) { // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), @@ -764,68 +865,66 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { UseLEA, TII, *RegInfo); } + if (NeedsWinEH && NumBytes) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + int SEHFrameOffset = 0; - if (NeedsWinEH) { - if (HasFP) { - // We need to set frame base offset low enough such that all saved - // register offsets would be positive relative to it, but we can't - // just use NumBytes, because .seh_setframe offset must be <=240. - // So we pretend to have only allocated enough space to spill the - // non-volatile registers. - // We don't care about the rest of stack allocation, because unwinder - // will restore SP to (BP - SEHFrameOffset) - for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { - int offset = MFI->getObjectOffset(Info.getFrameIdx()); - SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); - } - SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant - - // This only needs to account for XMM spill slots, GPR slots - // are covered by the .seh_pushreg's emitted above. 
- unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); - if (Size) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) - .addImm(Size) - .setMIFlag(MachineInstr::FrameSetup); - } + if (IsWinEH && HasFP) { + SEHFrameOffset = calculateSetFPREG(NumBytes); + if (SEHFrameOffset) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + StackPtr, false, SEHFrameOffset); + else + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); - } else { - // SP will be the base register for restoring XMMs - if (NumBytes) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } - } } - // Skip the rest of register spilling code - while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + const MachineInstr *FrameInstr = &*MBBI; ++MBBI; - // Emit SEH info for non-GPRs - if (NeedsWinEH) { - for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { - unsigned Reg = Info.getReg(); - if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) - continue; - assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); - - int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); - Offset += SEHFrameOffset; - - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) - .addImm(Reg) - .addImm(Offset) - .setMIFlag(MachineInstr::FrameSetup); + if (NeedsWinEH) { + int FI; + if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (X86::FR64RegClass.contains(Reg)) { + int Offset = getFrameIndexOffset(MF, FI); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + } } + } + if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + + // Realign stack after we spilled callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Win64 requires aligning the stack after the prologue. + if (IsWinEH && RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); } // If we need a base pointer, set it up here. It's whatever the value @@ -838,6 +937,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { @@ -863,33 +970,45 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + bool HasFP = hasFP(MF); const bool Is64BitILP32 = STI.isTarget64BitILP32(); - bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); - unsigned MachineFramePtr = Is64BitILP32 ? - getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + unsigned MachineFramePtr = + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == - ExceptionHandling::ItaniumWinEH; + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); + bool UseLEAForSP = false; + + // We can't use LEA instructions for adjusting the stack pointer if this is a + // leaf function in the Win64 ABI. Only ADD instructions may be used to + // deallocate the stack. + if (STI.useLeaForSP()) { + if (!IsWinEH) { + // We *aren't* using the Win64 ABI which means we are free to use LEA. + UseLEAForSP = true; + } else if (HasFP) { + // We *have* a frame pointer which means we are permitted to use LEA. + UseLEAForSP = true; + } + } switch (RetOpcode) { default: - llvm_unreachable("Can only insert epilog into returning blocks"); + llvm_unreachable("Can only insert epilogue into returning blocks"); case X86::RETQ: case X86::RETL: case X86::RETIL: @@ -907,32 +1026,19 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); - uint64_t MaxAlign = MFI->getMaxAlignment(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; - else - MaxAlign = MaxAlign ? 
MaxAlign : 4; - } - if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers were pushed on stack before the stack - // was realigned. - FrameSize -= CSSize; - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - CSSize; - } + NumBytes = FrameSize - CSSize; + + // Callee-saved registers were pushed on stack before the stack was + // realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); // Pop EBP. BuildMI(MBB, MBBI, DL, @@ -940,6 +1046,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } else { NumBytes = StackSize - CSSize; } + uint64_t SEHStackAllocAmt = NumBytes; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { @@ -967,10 +1074,20 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; - if (CSSize != 0) { + unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); + uint64_t LEAAmount = IsWinEH ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + + // There are only two legal forms of epilogue: + // - add SEHAllocationSize, %rsp + // - lea SEHAllocationSize(%FramePtr), %rsp + // + // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. + // However, we may use this sequence if we have a frame pointer because the + // effects of the prologue can safely be undone. + if (LEAAmount != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); + FramePtr, false, LEAAmount); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); @@ -980,8 +1097,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, - TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); --MBBI; } @@ -1027,14 +1144,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, - UseLEA, TII, *RegInfo); + UseLEAForSP, TII, *RegInfo); } // Jump to label or value in register. + bool IsWin64 = STI.isTargetWin64(); if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) - ? X86::TAILJMPd : X86::TAILJMPd64)); + unsigned Op = (RetOpcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); @@ -1044,14 +1163,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, JumpTarget.getTargetFlags()); } } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) - ? X86::TAILJMPm : X86::TAILJMPm64)); + unsigned Op = (RetOpcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? 
X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); for (unsigned i = 0; i != 5; ++i) MIB.addOperand(MBBI->getOperand(i)); } else if (RetOpcode == X86::TCRETURNri64) { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). - addReg(JumpTarget.getReg(), RegState::Kill); + BuildMI(MBB, MBBI, DL, + TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); } else { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); @@ -1071,24 +1192,58 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, - *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); } } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Offset will hold the offset from the stack pointer at function entry to the + // object. + // We need to factor in additional offsets applied during the prologue to the + // frame, base, and stack pointer depending on which is used. int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI->getStackSize(); + unsigned SlotSize = RegInfo->getSlotSize(); + bool HasFP = hasFP(MF); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int64_t FPDelta = 0; + + if (IsWinEH) { + assert(!MFI->hasCalls() || (StackSize % 16) == 8); + + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + uint64_t NumBytes = FrameSize - CSSize; + + uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); + if (FI && FI == X86FI->getFAIndex()) + return -SEHFrameOffset; + + // FPDelta is the offset from the "traditional" FP location of the old base + // pointer followed by return address and the location required by the + // restricted Win64 prologue. + // Add FPDelta to all offsets below that go through the frame pointer. + FPDelta = FrameSize - SEHFrameOffset; + assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + "FPDelta isn't aligned per the Win64 ABI!"); + } + if (RegInfo->hasBasePointer(MF)) { - assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); + assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; @@ -1096,33 +1251,32 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, } else if (RegInfo->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. 
- return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls } else { - if (!hasFP(MF)) + if (!HasFP) return Offset + StackSize; // Skip the saved EBP. - Offset += RegInfo->getSlotSize(); + Offset += SlotSize; // Skip the RETADDR move area - const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } - return Offset; + return Offset + FPDelta; } int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. @@ -1135,12 +1289,85 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } +// Simplified from getFrameIndexOffset keeping only StackPointer cases +int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + const X86RegisterInfo *RegInfo = + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + // Note: LLVM arranges the stack as: + // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) + // > "Stack Slots" (<--SP) + // We can always address StackSlots from RSP. We can usually (unless + // needsStackRealignment) address CSRs from RSP, but sometimes need to + // address them from RBP. FixedObjects can be placed anywhere in the stack + // frame depending on their specific requirements (i.e. we can actually + // refer to arguments to the function which are stored in the *callers* + // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs + // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + // We don't handle tail calls, and shouldn't be seeing them + // either. + int TailCallReturnAddrDelta = + MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } + + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. + // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. 
Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // + + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + FrameReg = RegInfo->getStackRegister(); + return getFrameIndexOffsetFromSP(MF, FI); +} + bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1207,8 +1434,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; @@ -1228,8 +1455,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // It can be done by spilling XMMs to stack frame. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg)) + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); @@ -1255,8 +1481,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); // Reload XMMs from stack frame. 
for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1287,7 +1513,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1368,9 +1594,9 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); uint64_t StackSize; - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; @@ -1382,8 +1608,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); - if (!STI.isTargetLinux() && !STI.isTargetDarwin() && - !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) report_fatal_error("Segmented stacks not supported on this platform."); // Eventually StackSize will be calculated by a link-time pass; which will @@ -1437,6 +1664,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack } else { report_fatal_error("Segmented stacks not supported on this platform."); } @@ -1459,6 +1689,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { @@ -1471,7 +1704,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) { + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { @@ -1515,7 +1749,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. 
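Before the __morestack path is reached, the check block emitted above boils down to one comparison: jump back to the normal prologue when SP minus the required stack space is still at or above the stacklet limit read from the per-platform TLS slot, otherwise fall through and grow the stack. A scalar sketch of that condition (function and parameter names are illustrative, not from the patch; the real code emits LEA/CMP/JA MachineInstrs rather than evaluating anything at run time):

#include <cassert>
#include <cstdint>

static bool needsMoreStack(uintptr_t SP, uint64_t RequiredStackSpace,
                           uintptr_t StackletLimitFromTLS) {
  uintptr_t NewSP = SP - RequiredStackSpace; // LEA -StackSize(%esp), scratch
  // JA_1 back to the prologue when NewSP is above the limit; fall through to
  // the __morestack call block otherwise.
  return NewSP < StackletLimitFromTLS;
}

int main() {
  // Invented addresses: limit at 0x1000, current SP at 0x8000.
  assert(!needsMoreStack(0x8000, 0x7000, 0x1000)); // still within the stacklet
  assert(needsMoreStack(0x8000, 0x7800, 0x1000));  // must call __morestack
  return 0;
}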
@@ -1546,12 +1780,36 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } // __morestack is in libgcc - if (Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack"); - else - BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack"); + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + // addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); @@ -1584,12 +1842,10 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) - ->getSlotSize(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const unsigned SlotSize = STI.getRegisterInfo()->getSlotSize(); const bool Is64Bit = STI.is64Bit(); const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; @@ -1695,7 +1951,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). 
@@ -1704,7 +1960,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); stackCheckMBB->addSuccessor(&prologueMBB, 99); stackCheckMBB->addSuccessor(incStackMBB, 1); @@ -1719,50 +1975,45 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const X86RegisterInfo &RegInfo = *STI.getRegisterInfo(); unsigned StackPtr = RegInfo.getStackRegister(); - bool reseveCallFrame = hasReservedCallFrame(MF); + bool reserveCallFrame = hasReservedCallFrame(MF); int Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); - if (!reseveCallFrame) { + if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - // TODO: consider using push / pop instead of sub + store / add if (Amount == 0) return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment(); - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + unsigned StackAlign = getStackAlignment(); + Amount = RoundUpToAlignment(Amount, StackAlign); MachineInstr *New = nullptr; - if (Opcode == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - // Factor out the amount the callee already popped. 
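In the rewritten eliminateCallFramePseudoInstr in this hunk, the SP adjustment that actually gets emitted is the outgoing-argument area rounded up to the stack alignment, minus whatever the call sequence already handles internally (converted pushes on setup, callee pops on destroy). A quick numeric sanity check with invented byte counts; roundUpToAlignment below is a local stand-in for the RoundUpToAlignment helper the new code calls, using the same formula the deleted line spelled out:

#include <cassert>
#include <cstdint>

static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const uint64_t StackAlign = 16;  // target stack alignment
  const uint64_t ArgArea = 20;     // outgoing argument bytes (invented)
  const uint64_t InternalAmt = 8;  // bytes handled inside the sequence
                                   // (invented), e.g. by converted pushes
  uint64_t Amount = roundUpToAlignment(ArgArea, StackAlign) - InternalAmt;
  assert(Amount == 24);            // size of the emitted SUB/ADD of ESP
  return 0;
}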
- Amount -= CalleeAmt; + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + if (Amount) { + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) + .addReg(StackPtr).addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); - if (Amount) { unsigned Opc = getADDriOpcode(IsLP64, Amount); New = BuildMI(MF, DL, TII.get(Opc), StackPtr) .addReg(StackPtr).addImm(Amount); @@ -1780,13 +2031,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return; } - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. - unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(CalleeAmt); + .addReg(StackPtr).addImm(InternalAmt); // The EFLAGS implicit def is dead. New->getOperand(3).setIsDead(); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 7740c3a..542bbbc 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -18,18 +18,16 @@ namespace llvm { -class MCSymbol; -class X86TargetMachine; -class X86Subtarget; - class X86FrameLowering : public TargetFrameLowering { public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} - static void getStackProbeFunction(const X86Subtarget &STI, - unsigned &CallOp, - const char *&Symbol); + /// Emit a call to the target's stack probe function. This is required for all + /// large stack allocations on Windows. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. + static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -64,14 +62,30 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + bool needsFrameIndexResolution(const MachineFunction &MF) const override; int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; + int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +private: + /// convertArgMovsToPushes - This method tries to convert a call sequence + /// that uses sub and mov instructions to put the argument onto the stack + /// into a series of pushes. + /// Returns true if the transformation succeeded, false if not. 
+ bool convertArgMovsToPushes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + uint64_t Amount) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 3ef7b2c..8d50ae1 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -156,9 +156,7 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), - Subtarget(&tm.getSubtarget<X86Subtarget>()), - OptForSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -166,7 +164,7 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. - Subtarget = &TM.getSubtarget<X86Subtarget>(); + Subtarget = &MF.getSubtarget<X86Subtarget>(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -233,7 +231,7 @@ namespace { char ConstraintCode, std::vector<SDValue> &OutOps) override; - void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI); + void EmitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, SDValue &Scale, SDValue &Index, @@ -298,7 +296,7 @@ namespace { /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. const X86InstrInfo *getInstrInfo() const { - return getTargetMachine().getSubtargetImpl()->getInstrInfo(); + return Subtarget->getInstrInfo(); } /// \brief Address-mode matching performs shift-of-and to and-of-shift @@ -395,17 +393,14 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, Ops.clear(); Ops.push_back(NewChain); } - for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) - Ops.push_back(OrigChain.getOperand(i)); + Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); - unsigned NumOps = Call.getNode()->getNumOperands(); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); - for (unsigned i = 1, e = NumOps; i != e; ++i) - Ops.push_back(Call.getOperand(i)); + Ops.append(Call->op_begin() + 1, Call->op_end()); CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } @@ -453,8 +448,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { @@ -571,14 +565,18 @@ void X86DAGToDAGISel::PreprocessISelDAG() { /// EmitSpecialCodeForMain - Emit any code that needs to be executed only in /// the main function. -void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, - MachineFrameInfo *MFI) { - const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo(); +void X86DAGToDAGISel::EmitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { - unsigned CallOp = - Subtarget->is64Bit() ? 
X86::CALL64pcrel32 : X86::CALLpcrel32; - BuildMI(BB, DebugLoc(), - TII->get(CallOp)).addExternalSymbol("__main"); + TargetLowering::ArgListTy Args; + + TargetLowering::CallLoweringInfo CLI(*CurDAG); + CLI.setChain(CurDAG->getRoot()) + .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy()), + std::move(Args), 0); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); + CurDAG->setRoot(Result.second); } } @@ -586,7 +584,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo()); + EmitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -918,7 +916,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. - if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. @@ -1891,8 +1889,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: - case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4: - case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4: + case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1: + case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1: case X86::CMOVA16rr: case X86::CMOVA16rm: case X86::CMOVA32rr: case X86::CMOVA32rm: case X86::CMOVA64rr: case X86::CMOVA64rm: @@ -2504,7 +2502,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = @@ -2612,26 +2610,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) { - // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a - // smaller encoding - if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 && - X86::isZeroNode(N1)) { - SDValue Reg = N0.getOperand(0); - SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8); - - // Emit testb - if (Reg.getScalarValueSizeInBits() > 8) - Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg); - // Emit a testb. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, - Reg, Imm); - ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); - return nullptr; - } - + HasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); - } + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. 
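The mask-contiguity test in FoldMaskAndShiftToScale above rejects any mask whose set bits do not form a single run: after discarding the trailing zeros, the trailing ones plus the trailing and leading zero counts must account for all 64 bits. A small standalone check of that identity (the helper name is mine, and compiler builtins stand in for the llvm::count* utilities):

#include <cassert>
#include <cstdint>

// Mirrors countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ == 64.
static bool isContiguousMask(uint64_t Mask) {
  if (Mask == 0)
    return false;
  unsigned TZ = __builtin_ctzll(Mask);
  unsigned LZ = __builtin_clzll(Mask);
  uint64_t Shifted = Mask >> TZ;
  unsigned TrailingOnes =
      Shifted == UINT64_MAX ? 64 : __builtin_ctzll(~Shifted);
  return TrailingOnes + TZ + LZ == 64;
}

int main() {
  assert(isContiguousMask(0x0ff0));  // one run of ones: accepted
  assert(!isContiguousMask(0x0f0f)); // two separate runs: rejected
  return 0;
}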
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f05b6c6..6866be7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15,6 +15,7 @@ #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" @@ -66,11 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -static cl::opt<bool> ExperimentalVectorShuffleLowering( - "x86-experimental-vector-shuffle-lowering", cl::init(true), - cl::desc("Enable an experimental vector shuffle lowering code path."), - cl::Hidden); - static cl::opt<int> ReciprocalEstimateRefinementSteps( "x86-recip-refinement-steps", cl::init(1), cl::desc("Specify the number of Newton-Raphson iterations applied to the " @@ -107,21 +103,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); - - return Result; - + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } + /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { @@ -158,25 +151,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } + /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes +/// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. 
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG,SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } @@ -199,44 +190,23 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -// FIXME: This should stop caching the target machine as soon as -// we can remove resetOperationActions et al. -X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); TD = getDataLayout(); - resetOperationActions(); -} - -void X86TargetLowering::resetOperationActions() { - const TargetMachine &TM = getTargetMachine(); - static bool FirstTimeThrough = true; - - // If none of the target options have changed, then we don't need to reset the - // operation actions. - if (!FirstTimeThrough && TO == TM.Options) return; - - if (!FirstTimeThrough) { - // Reinitialize the actions. - initActions(); - FirstTimeThrough = false; - } - - TO = TM.Options; - // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; - // X86 is weird, it always uses i8 for shift amounts and setcc results. + // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - // For 64-bit since we have so many registers use the ILP scheduler, for - // 32-bit code use the register pressure specific scheduling. + // For 64-bit, since we have so many registers, use the ILP scheduler. + // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); @@ -244,14 +214,14 @@ void X86TargetLowering::resetOperationActions() { setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - const X86RegisterInfo *RegInfo = - TM.getSubtarget<X86Subtarget>().getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2 - if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { - addBypassSlowDiv(32, 8); - if (Subtarget->is64Bit()) + // Bypass expensive divides on Atom when compiling with O2. 
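The slow-divide bypass enabled by the addBypassSlowDiv calls that follow replaces a full-width division with a guarded narrow one when both operands happen to fit in the narrower type. The shape of the resulting code for addBypassSlowDiv(32, 8) is roughly as below; this is a sketch of the transform's effect, not code from the patch:

#include <cassert>
#include <cstdint>

static uint32_t bypassedUDiv32(uint32_t a, uint32_t b) {
  if (((a | b) & 0xffffff00u) == 0)           // both operands fit in 8 bits?
    return uint32_t(uint8_t(a) / uint8_t(b)); // fast narrow divide
  return a / b;                               // general (slow) divide
}

int main() {
  assert(bypassedUDiv32(200, 10) == 20);   // takes the narrow path
  assert(bypassedUDiv32(1000, 10) == 100); // falls back to the full divide
  return 0;
}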
+ if (TM.getOptLevel() >= CodeGenOpt::Default) { + if (Subtarget->hasSlowDivide32()) + addBypassSlowDiv(32, 8); + if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } @@ -296,7 +266,8 @@ void X86TargetLowering::resetOperationActions() { if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); @@ -521,7 +492,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); @@ -805,9 +778,7 @@ void X86TargetLowering::resetOperationActions() { // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; + for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); @@ -876,18 +847,19 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction(VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(InnerVT, VT, Expand); + + setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); - // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types, - // we have to deal with them whether we ask for Expansion or not. Setting - // Expand causes its own optimisation problems though, so leave them legal. - if (VT.getVectorElementType() == MVT::i1) - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like + // types, we have to deal with them whether we ask for Expansion or not. + // Setting Expand causes its own optimisation problems though, so leave + // them legal. 
+ if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -942,6 +914,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); @@ -991,6 +964,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each vector + // element. If popcnt isn't supported, always provide the custom version. + if (!Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + } + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; @@ -1002,6 +983,7 @@ void X86TargetLowering::resetOperationActions() { continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } @@ -1009,20 +991,24 @@ void X86TargetLowering::resetOperationActions() { // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. 
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom); + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); @@ -1070,7 +1056,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -1103,20 +1090,32 @@ void X86TargetLowering::resetOperationActions() { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); - setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); - setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); - // There is no BLENDI for byte vectors. We don't need to custom lower - // some vselects for now. + // We directly match byte blends in the backend as they match the VSELECT + // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. 
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom); + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are @@ -1212,7 +1211,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1232,11 +1232,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); - setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); - setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); - setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); @@ -1280,12 +1275,34 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each + // vector element. If popcnt isn't supported, always provide the custom + // version. 
+ if (!Subtarget->hasPOPCNT()) + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + // Custom CTPOP always performs better on natively supported v8i32 + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + + // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1314,21 +1331,23 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. - if (VT.is128BitVector()) + if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); @@ -1336,6 +1355,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } + if (Subtarget->hasInt256()) + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT VT = (MVT::SimpleValueType)i; @@ -1367,12 +1390,14 @@ void X86TargetLowering::resetOperationActions() { addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); @@ -1434,6 +1459,17 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); + setOperationAction(ISD::FRINT, MVT::v16f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); @@ -1486,16 +1522,13 @@ void X86TargetLowering::resetOperationActions() { } // Custom lower several nodes. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. - if (VT.is128BitVector() || VT.is256BitVector()) + if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); @@ -1511,12 +1544,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to promote non-256-bit vectors + // Do not attempt to promote non-512-bit vectors. 
if (!VT.is512BitVector()) continue; @@ -1536,17 +1571,22 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::LOAD, MVT::v64i8, Legal); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, Legal); + setOperationAction(ISD::ADD, MVT::v64i8, Legal); + setOperationAction(ISD::SUB, MVT::v32i16, Legal); + setOperationAction(ISD::SUB, MVT::v64i8, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, Legal); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - // Do not attempt to promote non-256-bit vectors + // Do not attempt to promote non-512-bit vectors. if (!VT.is512BitVector()) continue; - if ( EltSize < 32) { + if (EltSize < 32) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); } @@ -1560,14 +1600,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); - } - // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion - // of this type with custom code. - for (int VT = MVT::FIRST_VECTOR_VALUETYPE; - VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Custom); + setOperationAction(ISD::AND, MVT::v8i32, Legal); + setOperationAction(ISD::OR, MVT::v8i32, Legal); + setOperationAction(ISD::XOR, MVT::v8i32, Legal); + setOperationAction(ISD::AND, MVT::v4i32, Legal); + setOperationAction(ISD::OR, MVT::v4i32, Legal); + setOperationAction(ISD::XOR, MVT::v4i32, Legal); } // We want to custom lower some of our intrinsics. @@ -1607,9 +1646,8 @@ void X86TargetLowering::resetOperationActions() { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { - // For MacOSX, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. + // For MacOSX, we don't want the normal expansion of a libcall to sincos. + // We want to issue a libcall to __sincos_stret to avoid memory traffic. 
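The comment above is about the shape of the call rather than the math: returning both results by value lets them come back in registers, whereas the generic sincos expansion passes two out-pointers and reloads the results from memory. A plain C++ stand-in for the idea (hypothetical; the real __sincos_stret is provided by the Darwin runtime and its exact return convention is what the Custom lowering below has to honour):

    #include <cmath>
    #include <cstdio>

    struct SinCosResult {
      double Sin;
      double Cos;
    };

    // Hypothetical stand-in for __sincos_stret: one call, both results
    // returned by value instead of through pointers.
    static SinCosResult sincosStret(double X) {
      return {std::sin(X), std::cos(X)};
    }

    int main() {
      SinCosResult R = sincosStret(1.0);
      std::printf("sin=%f cos=%f\n", R.Sin, R.Cos);
      return 0;
    }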
setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } @@ -1627,6 +1665,7 @@ void X86TargetLowering::resetOperationActions() { // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); @@ -1640,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1650,11 +1691,10 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); - if (Subtarget->is64Bit()) - setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. @@ -1668,7 +1708,7 @@ void X86TargetLowering::resetOperationActions() { // Predictable cmov don't hurt on atom because it's in-order. PredictableSelectIsExpensive = !Subtarget->isAtom(); - + EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); @@ -1676,8 +1716,7 @@ void X86TargetLowering::resetOperationActions() { // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { - return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO && - Subtarget->is64Bit(); + return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction @@ -1733,7 +1772,7 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { return VT.changeVectorElementTypeToInteger(); } -/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) @@ -1758,7 +1797,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { } } -/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. @@ -1777,7 +1816,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { return Align; } -/// getOptimalMemOpType - Returns the target specific optimal type for load +/// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it @@ -1796,8 +1835,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) { + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && @@ -1843,7 +1881,7 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return true; } -/// getJumpTableEncoding - Return the entry encoding for a jump table in the +/// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { @@ -1869,8 +1907,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, MCSymbolRefExpr::VK_GOTOFF, Ctx); } -/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC -/// jumptable. +/// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) @@ -1880,9 +1917,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, return Table; } -/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the -/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an -/// MCExpr. +/// This returns the relocation base for the given PIC jumptable, +/// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { @@ -1894,14 +1930,14 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } -// FIXME: Why this routine is here? Move to RegInfo! -std::pair<const TargetRegisterClass*, uint8_t> -X86TargetLowering::findRepresentativeClass(MVT VT) const{ +std::pair<const TargetRegisterClass *, uint8_t> +X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: - return TargetLowering::findRepresentativeClass(VT); + return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; @@ -1994,7 +2030,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); - // Promote values to the appropriate types + // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) @@ -2005,7 +2041,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && - "Unexpected FP-extend for return value."); + "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. @@ -2060,14 +2096,15 @@ X86TargetLowering::LowerReturn(SDValue Chain, // Win32 requires us to put the sret argument to %eax as well. 
// We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. - if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { - MachineFunction &MF = DAG.getMachineFunction(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - unsigned Reg = FuncInfo->getSRetReturnReg(); - assert(Reg && - "SRetReturnReg should have been set in LowerFormalArguments()."); - SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); + // + // Checking Function.hasStructRetAttr() here is insufficient because the IR + // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is + // false, then an sret argument may be implicitly inserted in the SelDAG. In + // either case FuncInfo->setSRetReturnReg() will have been called. + if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) && + "No need for an sret register"); + SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy()); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? @@ -2141,7 +2178,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, return VT.bitsLT(MinVT) ? MinVT : VT; } -/// LowerCallResult - Lower the result values of a call into the +/// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue @@ -2221,8 +2258,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { return StackStructReturn; } -/// ArgsAreStructReturn - Determines whether a function uses struct -/// return semantics. +/// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { if (Ins.empty()) @@ -2236,10 +2272,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { return StackStructReturn; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" with size and alignment information specified by -/// the specific parameter attribute. The copy will be passed as a byval -/// function parameter. +/// Make a copy of an aggregate at address specified by "Src" to address +/// "Dst" with size and alignment information specified by the specific +/// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, @@ -2251,7 +2286,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// IsTailCallConvention - Return true if the calling convention is one that +/// Return true if the calling convention is one that /// supports tail call optimization. static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || @@ -2276,7 +2311,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { return true; } -/// FuncIsMadeTailCallSafe - Return true if the function is being made into +/// Return true if the function is being made into /// a tailcall target by changing its ABI. 
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, bool GuaranteedTailCallOpt) { @@ -2356,8 +2391,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, } const Function *Fn = MF.getFunction(); - bool NoImplicitFloatOps = Fn->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || @@ -2523,18 +2557,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } + // Figure out if XMM registers are in use. + assert(!(MF.getTarget().Options.UseSoftFloat && + Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + // 64-bit calling conventions support varargs and register parameters, so we - // have to do extra work to spill them in the prologue or forward them to - // musttail calls. - if (Is64Bit && isVarArg && - (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) { + // have to do extra work to spill them in the prologue. + if (Is64Bit && isVarArg && MFI->hasVAStart()) { // Find the first unallocated argument registers. ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); - unsigned NumIntRegs = - CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); - unsigned NumXMMRegs = - CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); @@ -2557,90 +2592,99 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } } - // Store them to the va_list returned by va_start. - if (MFI->hasVAStart()) { - if (IsWin64) { - const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( + if (IsWin64) { + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by deferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. 
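CCInfo.getFirstUnallocated(ArgGPRs) above now takes an ArrayRef instead of the old (pointer, count) pair. Because argument registers are handed out in order, the index it returns doubles as the number of registers the named parameters consumed, which is how the vararg spill code uses it. A simplified standalone model of what it computes (names are stand-ins; the real CCState tracks allocation differently):

    #include <cassert>
    #include <cstdint>
    #include <set>
    #include <vector>

    using MCPhysRegModel = uint16_t;   // stand-in for llvm::MCPhysReg

    // Index of the first candidate register not yet allocated, or the array
    // size if every candidate is taken.
    static unsigned firstUnallocated(const std::vector<MCPhysRegModel> &Regs,
                                     const std::set<MCPhysRegModel> &Allocated) {
      for (unsigned I = 0, E = (unsigned)Regs.size(); I != E; ++I)
        if (!Allocated.count(Regs[I]))
          return I;
      return (unsigned)Regs.size();
    }

    int main() {
      std::vector<MCPhysRegModel> ArgGPRs = {1, 2, 3, 4, 5, 6}; // RDI..R9 stand-ins
      std::set<MCPhysRegModel> Allocated = {1, 2};              // two named args
      assert(firstUnallocated(ArgGPRs, Allocated) == 2);
      return 0;
    }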
- SmallVector<SDValue, 8> MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(Offset)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector<SDValue, 12> SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex())); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset())); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { - // Add all GPRs, al, and XMMs to the list of forwards. We will add then - // to the liveout set on a musttail call. - assert(MFI->hasMustTailInVarArgFunc()); - auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); - typedef X86MachineFunctionInfo::Forward Forward; - - for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) { - unsigned VReg = - MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass); - Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]); - Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64)); - } - - if (!ArgXMMs.empty()) { - unsigned ALVReg = - MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass); - Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal); - Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8)); - - for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) { - unsigned VReg = - MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass); - Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]); - Forwards.push_back( - Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32)); - } - } + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, + DAG.getIntPtrConstant(Offset)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. 
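The two offsets set a few lines above follow the SysV x86-64 register save area layout: six 8-byte slots for the integer argument registers come first and the 16-byte XMM slots follow, so va_arg can resume at the first slot the named parameters did not consume. A standalone arithmetic sketch with made-up counts (only the layout constants come from the code above):

    #include <cstdio>

    int main() {
      const unsigned NumArgGPRs = 6;   // RDI, RSI, RDX, RCX, R8, R9
      const unsigned NumArgXMMs = 8;   // XMM0..XMM7
      unsigned NumIntRegsUsed = 2;     // e.g. two named integer parameters
      unsigned NumXMMRegsUsed = 1;     // e.g. one named FP parameter

      unsigned GPOffset     = NumIntRegsUsed * 8;                   // 16
      unsigned FPOffset     = NumArgGPRs * 8 + NumXMMRegsUsed * 16; // 64
      unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;     // 176

      std::printf("gp_offset=%u fp_offset=%u save_area=%u\n",
                  GPOffset, FPOffset, SaveAreaSize);
      return 0;
    }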
+ SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex())); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset())); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } + + if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget->hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget->hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget->hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Conservatively forward AL on x86_64, since it might be used for varargs. + if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &F : Forwards) { + // FIXME: Can we use a less constrained schedule? + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); + Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } @@ -2688,7 +2732,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, false, false, 0); } -/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call +/// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, @@ -2705,7 +2749,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call +/// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, @@ -2838,8 +2882,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. 
ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2952,7 +2995,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); @@ -2960,7 +3003,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } - if (Is64Bit && isVarArg && IsMustTail) { + if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); @@ -3044,10 +3087,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. + GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. @@ -3073,11 +3117,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; - } else if (Subtarget->isPICStyleRIPRel() && - isa<Function>(GV) && - cast<Function>(GV)->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::NonLazyBind)) { + } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && + cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). @@ -3117,7 +3158,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); - } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { + } else if (Subtarget->isTarget64BitILP32() && + Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } @@ -3146,7 +3188,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
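The comment above introduces the register mask operand that the lines below attach to the call node: one bit per physical register, set when the callee must preserve that register, so later passes treat everything with a clear bit as clobbered by the call. A simplified standalone model (the enum is a stand-in, not LLVM's register numbering):

    #include <bitset>
    #include <cassert>

    enum PhysRegModel { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NumRegsModel };

    int main() {
      std::bitset<NumRegsModel> Preserved;
      // SysV x86-64 callee-saved registers among the ones modeled here.
      Preserved.set(RBX);
      Preserved.set(RSP);
      Preserved.set(RBP);

      assert(Preserved.test(RBX));    // a value kept in RBX survives the call
      assert(!Preserved.test(RAX));   // RAX must be assumed clobbered
      return 0;
    }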
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3235,11 +3277,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const TargetMachine &TM = MF.getTarget(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); - const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3276,7 +3315,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return false; } else { unsigned Opcode = Def->getOpcode(); - if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); @@ -3341,6 +3381,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); + // Win64 functions have extra shadow space for argument homing. Don't do the + // sibcall if the caller and callee have mismatched expectations for this + // space. + if (IsCalleeWin64 != IsCallerWin64) + return false; + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; @@ -3352,8 +3398,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3465,8 +3510,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3494,7 +3538,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3563,17 +3607,6 @@ static bool isTargetShuffle(unsigned Opcode) { } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, - SDValue V1, SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::MOVDDUP: - return DAG.getNode(Opc, dl, VT, V1); - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3588,20 +3621,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, - SDValue V1, SDValue V2, unsigned TargetMask, - SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PALIGNR: - case X86ISD::VALIGN: - case X86ISD::SHUFP: - case X86ISD::VPERM2X128: - return DAG.getNode(Opc, dl, VT, V1, V2, - DAG.getConstant(TargetMask, MVT::i8)); - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3620,8 +3639,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3661,7 +3679,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. - if (M == CodeModel::Kernel && Offset > 0) + if (M == CodeModel::Kernel && Offset >= 0) return true; return false; @@ -3823,6 +3841,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtTy, + EVT NewVT) const { + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF + // relocation target a movq or addq instruction: don't let the load shrink. + SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); + if (BasePtr.getOpcode() == X86ISD::WrapperRIP) + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) + return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + return true; +} + /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -3835,6 +3865,24 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Speculate cttz only if we can directly use TZCNT. 
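The cttz hook whose comment ends the text above (its body, returning Subtarget->hasBMI(), follows this note, as does the matching ctlz hook gated on LZCNT) is really about the zero-input case: TZCNT and LZCNT define the result for a zero source as the operand width, which matches llvm.cttz and llvm.ctlz with is_zero_undef set to false, while plain BSF/BSR leave the destination undefined for zero, so speculating them would still need a guard. A portable sketch of the defined-at-zero behaviour the newer instructions provide (plain C++, illustration only):

    #include <cassert>
    #include <cstdint>

    // TZCNT-like count of trailing zeros: defined for a zero input.
    static unsigned cttz32(uint32_t X) {
      if (X == 0)
        return 32;            // the guard TZCNT makes unnecessary
      unsigned N = 0;
      while ((X & 1) == 0) {
        X >>= 1;
        ++N;
      }
      return N;
    }

    int main() {
      assert(cttz32(0x28) == 3);   // 0b101000 has three trailing zeros
      assert(cttz32(0) == 32);     // zero input is well defined
      return 0;
    }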
+ return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Speculate ctlz only if we can directly use LZCNT. + return Subtarget->hasLZCNT(); +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -3849,7 +3897,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified -/// sequential range (L, L+Pos]. or is undef. +/// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) @@ -3858,176 +3906,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFD. That is, it doesn't reference the other -/// operand - by default will match for first operand. -static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT, - bool TestSecondOperand = false) { - if (VT != MVT::v4f32 && VT != MVT::v4i32 && - VT != MVT::v2f64 && VT != MVT::v2i64) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - unsigned Lo = TestSecondOperand ? NumElems : 0; - unsigned Hi = Lo + NumElems; - - for (unsigned i = 0; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) - return false; - - return true; -} - -/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) - return false; - - // Lower quadword copied in order or undef. - if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) - return false; - - // Upper quadword shuffled. - for (unsigned i = 4; i != 8; ++i) - if (!isUndefOrInRange(Mask[i], 4, 8)) - return false; - - if (VT == MVT::v16i16) { - // Lower quadword copied in order or undef. - if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) - return false; - - // Upper quadword shuffled. - for (unsigned i = 12; i != 16; ++i) - if (!isUndefOrInRange(Mask[i], 12, 16)) - return false; - } - - return true; -} - -/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) - return false; - - // Upper quadword copied in order. - if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) - return false; - - // Lower quadword shuffled. - for (unsigned i = 0; i != 4; ++i) - if (!isUndefOrInRange(Mask[i], 0, 4)) - return false; - - if (VT == MVT::v16i16) { - // Upper quadword copied in order. - if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) - return false; - - // Lower quadword shuffled. - for (unsigned i = 8; i != 12; ++i) - if (!isUndefOrInRange(Mask[i], 8, 12)) - return false; - } - - return true; -} - -/// \brief Return true if the mask specifies a shuffle of elements that is -/// suitable for input to intralane (palignr) or interlane (valign) vector -/// right-shift. -static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = InterLane ? 
1: VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - // Do not handle 64-bit element shuffles with palignr. - if (NumLaneElts == 2) - return false; - - for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { - unsigned i; - for (i = 0; i != NumLaneElts; ++i) { - if (Mask[i+l] >= 0) - break; - } - - // Lane is all undef, go to next lane - if (i == NumLaneElts) - continue; - - int Start = Mask[i+l]; - - // Make sure its in this lane in one of the sources - if (!isUndefOrInRange(Start, l, l+NumLaneElts) && - !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) - return false; - - // If not lane 0, then we must match lane 0 - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) - return false; - - // Correct second source to be contiguous with first source - if (Start >= (int)NumElts) - Start -= NumElts - NumLaneElts; - - // Make sure we're shifting in the right direction. - if (Start <= (int)(i+l)) - return false; - - Start -= i; - - // Check the rest of the elements to see if they are consecutive. - for (++i; i != NumLaneElts; ++i) { - int Idx = Mask[i+l]; - - // Make sure its in this lane - if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && - !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) - return false; - - // If not lane 0, then we must match lane 0 - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) - return false; - - if (Idx >= (int)NumElts) - Idx -= NumElts - NumLaneElts; - - if (!isUndefOrEqual(Idx, Start+i)) - return false; - - } - } - - return true; -} - -/// \brief Return true if the node specifies a shuffle of elements that is -/// suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || - (VT.is256BitVector() && !Subtarget->hasInt256()) || - VT.is512BitVector()) - // FIXME: Add AVX512BW. - return false; - - return isAlignrMask(Mask, VT, false); -} - -/// \brief Return true if the node specifies a shuffle of elements that is -/// suitable for input to VALIGN. -static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - // FIXME: Add AVX512VL. - if (!VT.is512BitVector() || !Subtarget->hasAVX512()) - return false; - return isAlignrMask(Mask, VT, true); -} - /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, @@ -4043,664 +3921,6 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, } } -/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128/256-bit -/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be -/// reverse of what x86 shuffles want. -static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) { - - unsigned NumElems = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElems = NumElems/NumLanes; - - if (NumLaneElems != 2 && NumLaneElems != 4) - return false; - - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - bool symetricMaskRequired = - (VT.getSizeInBits() >= 256) && (EltSize == 32); - - // VSHUFPSY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. 
- // - // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 - // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 - // - // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, - // Y3..Y0, Y3..Y0, X3..X0, X3..X0 - // - // VSHUFPDY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X3 X2 X1 X0 - // SRC2 => Y3 Y2 Y1 Y0 - // - // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 - // - SmallVector<int, 4> MaskVal(NumLaneElems, -1); - unsigned HalfLaneElems = NumLaneElems/2; - for (unsigned l = 0; l != NumElems; l += NumLaneElems) { - for (unsigned i = 0; i != NumLaneElems; ++i) { - int Idx = Mask[i+l]; - unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); - if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) - return false; - // For VSHUFPSY, the mask of the second half must be the same as the - // first but with the appropriate offsets. This works in the same way as - // VPERMILPS works with masks. - if (!symetricMaskRequired || Idx < 0) - continue; - if (MaskVal[i] < 0) { - MaskVal[i] = Idx - l; - continue; - } - if ((signed)(Idx - l) != MaskVal[i]) - return false; - } - } - - return true; -} - -/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVHLPS. -static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 4) - return false; - - // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 - return isUndefOrEqual(Mask[0], 6) && - isUndefOrEqual(Mask[1], 7) && - isUndefOrEqual(Mask[2], 2) && - isUndefOrEqual(Mask[3], 3); -} - -/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form -/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, -/// <2, 3, 2, 3> -static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 4) - return false; - - return isUndefOrEqual(Mask[0], 2) && - isUndefOrEqual(Mask[1], 3) && - isUndefOrEqual(Mask[2], 2) && - isUndefOrEqual(Mask[3], 3); -} - -/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. -static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i + NumElems)) - return false; - - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - return true; -} - -/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
-static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i + e], i + NumElems)) - return false; - - return true; -} - -/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to INSERTPS. -/// i. e: If all but one element come from the same vector. -static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { - // TODO: Deal with AVX's VINSERTPS - if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) - return false; - - unsigned CorrectPosV1 = 0; - unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { - if (Mask[i] == -1) { - ++CorrectPosV1; - ++CorrectPosV2; - continue; - } - - if (Mask[i] == i) - ++CorrectPosV1; - else if (Mask[i] == i + 4) - ++CorrectPosV2; - } - - if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements (undefs count as elements from any vector) from one - // vector, and one from another. - return true; - - return false; -} - -// -// Some special combinations that can be optimized. -// -static -SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDLoc dl(SVOp); - - if (VT != MVT::v8i32 && VT != MVT::v8f32) - return SDValue(); - - ArrayRef<int> Mask = SVOp->getMask(); - - // These are the special masks that may be optimized. - static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; - static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; - bool MatchEvenMask = true; - bool MatchOddMask = true; - for (int i=0; i<8; ++i) { - if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) - MatchEvenMask = false; - if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) - MatchOddMask = false; - } - - if (!MatchEvenMask && !MatchOddMask) - return SDValue(); - - SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - - SDValue Op0 = SVOp->getOperand(0); - SDValue Op1 = SVOp->getOperand(1); - - if (MatchEvenMask) { - // Shift the second operand right to 32 bits. - static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; - Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); - } else { - // Shift the first operand left to 32 bits. - static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; - Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); - } - static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); -} - -/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, - bool HasInt256, bool V2IsSplat = false) { - - assert(VT.getSizeInBits() >= 128 && - "Unsupported vector type for unpckl"); - - unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && - "Unsupported vector type for unpckh"); - - // AVX defines UNPCK* to operate independently on 128-bit lanes. 
- unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) - return false; - } - } - } - - return true; -} - -/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to UNPCKH. -static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, - bool HasInt256, bool V2IsSplat = false) { - assert(VT.getSizeInBits() >= 128 && - "Unsupported vector type for unpckh"); - - unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && - "Unsupported vector type for unpckh"); - - // AVX defines UNPCK* to operate independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j+NumElts)) - return false; - } - } - } - return true; -} - -/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form -/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, -/// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - unsigned NumElts = VT.getVectorNumElements(); - bool Is256BitVec = VT.is256BitVector(); - - if (VT.is512BitVector()) - return false; - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unsupported vector type for unpckh"); - - if (Is256BitVec && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern - // FIXME: Need a better way to get rid of this, there's no latency difference - // between UNPCKLPD and MOVDDUP, the later should always be checked first and - // the former later. We should also remove the "_undef" special mask. - if (NumElts == 4 && Is256BitVec) - return false; - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; - } - } - - return true; -} - -/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form -/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, -/// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - unsigned NumElts = VT.getVectorNumElements(); - - if (VT.is512BitVector()) - return false; - - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unsupported vector type for unpckh"); - - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; - } - } - return true; -} - -// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or -// (src1[0], src0[1]), manipulation with 256-bit sub-vectors -static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) { - if (!VT.is512BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfSize = NumElts/2; - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { - *Imm = 1; - return true; - } - } - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { - *Imm = 0; - return true; - } - } - return false; -} - -/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSS, -/// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { - if (VT.getVectorElementType().getSizeInBits() < 32) - return false; - if (!VT.is128BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - - if (!isUndefOrEqual(Mask[0], NumElts)) - return false; - - for (unsigned i = 1; i != NumElts; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - return true; -} - -/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered -/// as permutations between 128-bit chunks or halves. As an example: this -/// shuffle bellow: -/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> -/// The first half comes from the second half of V1 and the second half from the -/// the second half of V2. -static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { - if (!HasFp256 || !VT.is256BitVector()) - return false; - - // The shuffle result is divided into half A and half B. In total the two - // sources have 4 halves, namely: C, D, E, F. The final values of A and - // B must come from C, D, E or F. - unsigned HalfSize = VT.getVectorNumElements()/2; - bool MatchA = false, MatchB = false; - - // Check if A comes from one of C, D, E, F. - for (unsigned Half = 0; Half != 4; ++Half) { - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { - MatchA = true; - break; - } - } - - // Check if B comes from one of C, D, E, F. 
- for (unsigned Half = 0; Half != 4; ++Half) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - -/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. -static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { - MVT VT = SVOp->getSimpleValueType(0); - - unsigned HalfSize = VT.getVectorNumElements()/2; - - unsigned FstHalf = 0, SndHalf = 0; - for (unsigned i = 0; i < HalfSize; ++i) { - if (SVOp->getMaskElt(i) > 0) { - FstHalf = SVOp->getMaskElt(i)/HalfSize; - break; - } - } - for (unsigned i = HalfSize; i < HalfSize*2; ++i) { - if (SVOp->getMaskElt(i) > 0) { - SndHalf = SVOp->getMaskElt(i)/HalfSize; - break; - } - } - - return (FstHalf | (SndHalf << 4)); -} - -// Symetric in-lane mask. Each lane has 4 elements (for imm8) -static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EltSize < 32) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - Imm8 = 0; - if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { - for (unsigned i = 0; i != NumElts; ++i) { - if (Mask[i] < 0) - continue; - Imm8 |= Mask[i] << (i*2); - } - return true; - } - - unsigned LaneSize = 4; - SmallVector<int, 4> MaskVal(LaneSize, -1); - - for (unsigned l = 0; l != NumElts; l += LaneSize) { - for (unsigned i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) - return false; - if (Mask[i+l] < 0) - continue; - if (MaskVal[i] < 0) { - MaskVal[i] = Mask[i+l] - l; - Imm8 |= MaskVal[i] << (i*2); - continue; - } - if (Mask[i+l] != (signed)(MaskVal[i]+l)) - return false; - } - } - return true; -} - -/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. -static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (VT.getSizeInBits() < 256 || EltSize < 32) - return false; - bool symetricMaskRequired = (EltSize == 32); - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned LaneSize = NumElts/NumLanes; - // 2 or 4 elements in one lane - - SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); - for (unsigned l = 0; l != NumElts; l += LaneSize) { - for (unsigned i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) - return false; - if (symetricMaskRequired) { - if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { - ExpectedMaskVal[i] = Mask[i+l] - l; - continue; - } - if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) - return false; - } - } - } - return true; -} - -/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse -/// of what x86 movss want. X86 movs requires the lowest element to be lowest -/// element of vector 2 and the other elements to come from vector 1 in order. 
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, - bool V2IsSplat = false, bool V2IsUndef = false) { - if (!VT.is128BitVector()) - return false; - - unsigned NumOps = VT.getVectorNumElements(); - if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) - return false; - - if (!isUndefOrEqual(Mask[0], 0)) - return false; - - for (unsigned i = 1; i != NumOps; ++i) - if (!(isUndefOrEqual(Mask[i], i+NumOps) || - (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || - (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) - return false; - - return true; -} - -/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. -/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8) || - (VT.is512BitVector() && NumElems != 16)) - return false; - - // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i != NumElems; i += 2) - if (!isUndefOrEqual(Mask[i], i+1) || - !isUndefOrEqual(Mask[i+1], i+1)) - return false; - - return true; -} - -/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. -/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8) || - (VT.is512BitVector() && NumElems != 16)) - return false; - - // "i" is the value the indexed mask element must have - for (unsigned i = 0; i != NumElems; i += 2) - if (!isUndefOrEqual(Mask[i], i) || - !isUndefOrEqual(Mask[i+1], i)) - return false; - - return true; -} - -/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// version of MOVDDUP. -static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { - if (!HasFp256 || !VT.is256BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 4) - return false; - - for (unsigned i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(Mask[i], 0)) - return false; - for (unsigned i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(Mask[i], NumElts/2)) - return false; - return true; -} - -/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128-bit -/// version of MOVDDUP. 
-static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned e = VT.getVectorNumElements() / 2; - for (unsigned i = 0; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - for (unsigned i = 0; i != e; ++i) - if (!isUndefOrEqual(Mask[e+i], i)) - return false; - return true; -} - /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors @@ -4754,125 +3974,6 @@ bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } -/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -/// Handles 128-bit and 256-bit. -static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT.getSizeInBits() >= 128) && - "Unsupported vector type for PSHUF/SHUFP"); - - // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate - // independently on 128-bit lanes. - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && - "Only supports 2, 4 or 8 elements per lane"); - - unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; - unsigned Mask = 0; - for (unsigned i = 0; i != NumElts; ++i) { - int Elt = N->getMaskElt(i); - if (Elt < 0) continue; - Elt &= NumLaneElts - 1; - unsigned ShAmt = (i << Shift) % 8; - Mask |= Elt << ShAmt; - } - - return Mask; -} - -/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. -static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT == MVT::v8i16 || VT == MVT::v16i16) && - "Unsupported vector type for PSHUFHW"); - - unsigned NumElts = VT.getVectorNumElements(); - - unsigned Mask = 0; - for (unsigned l = 0; l != NumElts; l += 8) { - // 8 nodes per lane, but we only care about the last 4. - for (unsigned i = 0; i < 4; ++i) { - int Elt = N->getMaskElt(l+i+4); - if (Elt < 0) continue; - Elt &= 0x3; // only 2-bits. - Mask |= Elt << (i * 2); - } - } - - return Mask; -} - -/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT == MVT::v8i16 || VT == MVT::v16i16) && - "Unsupported vector type for PSHUFHW"); - - unsigned NumElts = VT.getVectorNumElements(); - - unsigned Mask = 0; - for (unsigned l = 0; l != NumElts; l += 8) { - // 8 nodes per lane, but we only care about the first 4. - for (unsigned i = 0; i < 4; ++i) { - int Elt = N->getMaskElt(l+i); - if (Elt < 0) continue; - Elt &= 0x3; // only 2-bits - Mask |= Elt << (i * 2); - } - } - - return Mask; -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with -/// VALIGN (if Interlane is true) instructions. -static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, - bool InterLane) { - MVT VT = SVOp->getSimpleValueType(0); - unsigned EltSize = InterLane ? 
1 : - VT.getVectorElementType().getSizeInBits() >> 3; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - int Val = 0; - unsigned i; - for (i = 0; i != NumElts; ++i) { - Val = SVOp->getMaskElt(i); - if (Val >= 0) - break; - } - if (Val >= (int)NumElts) - Val -= NumElts - NumLaneElts; - - assert(Val - i > 0 && "PALIGNR imm should be positive"); - return (Val - i) * EltSize; -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the PALIGNR instruction. -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - return getShuffleAlignrImmediate(SVOp, false); -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the VALIGN instruction. -static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { - return getShuffleAlignrImmediate(SVOp, true); -} - - static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4947,119 +4048,6 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// ShouldXformToMOVHLPS - Return true if the node should be transformed to -/// match movhlps. The lower half elements should come from upper half of -/// V1 (and in order), and the upper half elements should come from the upper -/// half of V2 (and in order). -static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - if (VT.getVectorNumElements() != 4) - return false; - for (unsigned i = 0, e = 2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i+2)) - return false; - for (unsigned i = 2; i != 4; ++i) - if (!isUndefOrEqual(Mask[i], i+4)) - return false; - return true; -} - -/// isScalarLoadToVector - Returns true if the node is a scalar load that -/// is promoted to a vector. It also returns the LoadSDNode by reference if -/// required. -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { - if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - N = N->getOperand(0).getNode(); - if (!ISD::isNON_EXTLoad(N)) - return false; - if (LD) - *LD = cast<LoadSDNode>(N); - return true; -} - -// Test whether the given value is a vector value which will be legalized -// into a load. -static bool WillBeConstantPoolLoad(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - // Check for any non-constant elements. - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - switch (N->getOperand(i).getNode()->getOpcode()) { - case ISD::UNDEF: - case ISD::ConstantFP: - case ISD::Constant: - break; - default: - return false; - } - - // Vectors of all-zeros and all-ones are materialized with special - // instructions rather than being loaded. - return !ISD::isBuildVectorAllZeros(N) && - !ISD::isBuildVectorAllOnes(N); -} - -/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to -/// match movlp{s|d}. The lower half elements should come from lower half of -/// V1 (and in order), and the upper half elements should come from the upper -/// half of V2 (and in order). And since V1 will become the source of the -/// MOVLP, it must be either a vector load or a scalar load to vector. 
-static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, - ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) - return false; - // Is V2 is a vector load, don't do this transformation. We will try to use - // load folding shufps op. - if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i+NumElems)) - return false; - return true; -} - -/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved -/// to an zero vector. -/// FIXME: move to dag combiner / method on ShuffleVectorSDNode -static bool isZeroShuffle(ShuffleVectorSDNode *N) { - SDValue V1 = N->getOperand(0); - SDValue V2 = N->getOperand(1); - unsigned NumElems = N->getValueType(0).getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int Idx = N->getMaskElt(i); - if (Idx >= (int)NumElems) { - unsigned Opc = V2.getOpcode(); - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) - continue; - if (Opc != ISD::BUILD_VECTOR || - !X86::isZeroNode(V2.getOperand(Idx-NumElems))) - return false; - } else if (Idx >= 0) { - unsigned Opc = V1.getOpcode(); - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) - continue; - if (Opc != ISD::BUILD_VECTOR || - !X86::isZeroNode(V1.getOperand(Idx))) - return false; - } - } - return true; -} - /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, @@ -5131,16 +4119,6 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } -/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements -/// that point to V2 points to its first element. -static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - if (Mask[i] > (int)NumElems) { - Mask[i] = NumElems; - } - } -} - /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, @@ -5177,92 +4155,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by -// a generic shuffle instruction because the target has no such instructions. -// Generate shuffles which repeat i16 and i8 several times until they can be -// represented by v4f32 and then be manipulated by target suported shuffles. 
-static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { - MVT VT = V.getSimpleValueType(); - int NumElems = VT.getVectorNumElements(); - SDLoc dl(V); - - while (NumElems > 4) { - if (EltNo < NumElems/2) { - V = getUnpackl(DAG, dl, VT, V, V); - } else { - V = getUnpackh(DAG, dl, VT, V, V); - EltNo -= NumElems/2; - } - NumElems >>= 1; - } - return V; -} - -/// getLegalSplat - Generate a legal splat with supported x86 shuffles -static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { - MVT VT = V.getSimpleValueType(); - SDLoc dl(V); - - if (VT.is128BitVector()) { - V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); - int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; - V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), - &SplatMask[0]); - } else if (VT.is256BitVector()) { - // To use VPERMILPS to splat scalars, the second half of indicies must - // refer to the higher part, which is a duplication of the lower one, - // because VPERMILPS can only handle in-lane permutations. - int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, - EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; - - V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); - V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), - &SplatMask[0]); - } else - llvm_unreachable("Vector size not supported"); - - return DAG.getNode(ISD::BITCAST, dl, VT, V); -} - -/// PromoteSplat - Splat is promoted to target supported vector shuffles. -static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { - MVT SrcVT = SV->getSimpleValueType(0); - SDValue V1 = SV->getOperand(0); - SDLoc dl(SV); - - int EltNo = SV->getSplatIndex(); - int NumElems = SrcVT.getVectorNumElements(); - bool Is256BitVec = SrcVT.is256BitVector(); - - assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && - "Unknown how to promote splat for type"); - - // Extract the 128-bit part containing the splat element and update - // the splat element index when it refers to the higher register. - if (Is256BitVec) { - V1 = Extract128BitVector(V1, EltNo, DAG, dl); - if (EltNo >= NumElems/2) - EltNo -= NumElems/2; - } - - // All i16 and i8 vector types can't be used directly by a generic shuffle - // instruction because the target has no such instruction. Generate shuffles - // which repeat i16 and i8 several times until they fit in i32, and then can - // be manipulated by target suported shuffles. - MVT EltVT = SrcVT.getVectorElementType(); - if (EltVT == MVT::i8 || EltVT == MVT::i16) - V1 = PromoteSplati8i16(V1, DAG, EltNo); - - // Recreate the 256-bit vector and place the same 128-bit vector - // into the low and high part. This is necessary because we want - // to use VPERM* to shuffle the vectors - if (Is256BitVec) { - V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); - } - - return getLegalSplat(DAG, V1, EltNo); -} - /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified /// vector of zero or undef vector. This produces a shuffle where the low /// element of V2 is swizzled into the zero/undef vector, landing at element @@ -5394,13 +4286,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return false; if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { - // FIXME: Support AVX-512 here. 
- Type *Ty = C->getType(); - if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 && - Ty->getVectorNumElements() != 32)) - return false; - DecodePSHUFBMask(C, Mask); + if (Mask.empty()) + return false; break; } @@ -5412,16 +4300,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, IsUnary = true; break; case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) { - Mask.push_back(i); - } + case X86ISD::MOVSD: + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; - } case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); @@ -5429,11 +4310,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); + IsUnary = true; break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); + IsUnary = true; break; case X86ISD::MOVDDUP: + DecodeMOVDDUPMask(VT, Mask); + IsUnary = true; + break; case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: @@ -5517,148 +4403,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// getNumOfConsecutiveZeros - Return the number of elements of a vector -/// shuffle operation which come from a consecutively from a zero. The -/// search can start in two different directions, from left or right. -/// We count undefs as zeros until PreferredNum is reached. -static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, - unsigned NumElems, bool ZerosFromLeft, - SelectionDAG &DAG, - unsigned PreferredNum = -1U) { - unsigned NumZeros = 0; - for (unsigned i = 0; i != NumElems; ++i) { - unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; - SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); - if (!Elt.getNode()) - break; - - if (X86::isZeroNode(Elt)) - ++NumZeros; - else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. - NumZeros = std::min(NumZeros + 1, PreferredNum); - else - break; - } - - return NumZeros; -} - -/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) -/// correspond consecutively to elements from one of the vector operands, -/// starting from its index OpIdx. Also tell OpNum which source vector operand. -static -bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, - unsigned MaskI, unsigned MaskE, unsigned OpIdx, - unsigned NumElems, unsigned &OpNum) { - bool SeenV1 = false; - bool SeenV2 = false; - - for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { - int Idx = SVOp->getMaskElt(i); - // Ignore undef indicies - if (Idx < 0) - continue; - - if (Idx < (int)NumElems) - SeenV1 = true; - else - SeenV2 = true; - - // Only accept consecutive elements from the same vector - if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) - return false; - } - - OpNum = SeenV1 ? 0 : 1; - return true; -} - -/// isVectorShiftRight - Returns true if the shuffle can be implemented as a -/// logical left shift of a vector. 
-static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = - SVOp->getSimpleValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros( - SVOp, NumElems, false /* check zeros from right */, DAG, - SVOp->getMaskElt(0)); - unsigned OpSrc; - - if (!NumZeros) - return false; - - // Considering the elements in the mask that are not consecutive zeros, - // check if they consecutively come from only one of the source vectors. - // - // V1 = {X, A, B, C} 0 - // \ \ \ / - // vector_shuffle V1, V2 <1, 2, 3, X> - // - if (!isShuffleMaskConsecutive(SVOp, - 0, // Mask Start Index - NumElems-NumZeros, // Mask End Index(exclusive) - NumZeros, // Where to start looking in the src vector - NumElems, // Number of elements in vector - OpSrc)) // Which source operand ? - return false; - - isLeft = false; - ShAmt = NumZeros; - ShVal = SVOp->getOperand(OpSrc); - return true; -} - -/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a -/// logical left shift of a vector. -static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = - SVOp->getSimpleValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros( - SVOp, NumElems, true /* check zeros from left */, DAG, - NumElems - SVOp->getMaskElt(NumElems - 1) - 1); - unsigned OpSrc; - - if (!NumZeros) - return false; - - // Considering the elements in the mask that are not consecutive zeros, - // check if they consecutively come from only one of the source vectors. - // - // 0 { A, B, X, X } = V2 - // / \ / / - // vector_shuffle V1, V2 <X, X, 4, 5> - // - if (!isShuffleMaskConsecutive(SVOp, - NumZeros, // Mask Start Index - NumElems, // Mask End Index(exclusive) - 0, // Where to start looking in the src vector - NumElems, // Number of elements in vector - OpSrc)) // Which source operand ? - return false; - - isLeft = true; - ShAmt = NumZeros; - ShVal = SVOp->getOperand(OpSrc); - return true; -} - -/// isVectorShift - Returns true if the shuffle can be implemented as a -/// logical left or right shift of a vector. -static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - // Although the logic below support any bitwidth size, there are no - // shift instructions which handle more than 128-bit vectors. - if (!SVOp->getSimpleValueType(0).is128BitVector()) - return false; - - if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || - isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) - return true; - - return false; -} - /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, @@ -5744,19 +4488,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. - bool Zeroable[4]; + std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); } - assert(std::count_if(&Zeroable[0], &Zeroable[4], - [](bool M) { return !M; }) > 1 && + assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. 
SDValue FirstNonZero; - for (int i=0; i < 4; ++i) { + unsigned FirstNonZeroIdx; + for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); @@ -5767,8 +4511,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); - if (!FirstNonZero.getNode()) + if (!FirstNonZero.getNode()) { FirstNonZero = Elt; + FirstNonZeroIdx = i; + } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); @@ -5807,14 +4553,14 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, return SDValue(); SDValue V2 = Elt.getOperand(0); - if (Elt == FirstNonZero) + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; - + SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) @@ -5833,10 +4579,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. - unsigned ZMask = 0; - for (int i = 0; i < 4; ++i) - if (Zeroable[i]) - ZMask |= 1 << i; + unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); @@ -5845,19 +4588,19 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); } -/// getVShift - Return a vector logical shift node. -/// +/// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); - EVT ShVT = MVT::v2i64; + MVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType()); + assert(NumBits % 8 == 0 && "Only support byte sized shifts"); + SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy); return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opc, dl, ShVT, SrcOp, - DAG.getConstant(NumBits, - TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); + DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue @@ -5924,9 +4667,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - SmallVector<int, 8> Mask; - for (unsigned i = 0; i != NumElems; ++i) - Mask.push_back(EltNo); + SmallVector<int, 8> Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } @@ -5934,19 +4675,18 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); } -/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a -/// vector of type 'VT', see if the elements can be replaced by a single large -/// load which has the same value as a build_vector whose operands are 'elts'. +/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the +/// elements can be replaced by a single large load which has the same value as +/// a build_vector or insert_subvector whose loaded operands are 'Elts'. 
/// /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. -static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, +static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { - EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; @@ -5957,7 +4697,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; - + // Look through a bitcast. + if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) + Elt = Elt.getOperand(0); if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); @@ -5972,7 +4714,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, continue; LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + EVT LdVT = Elt.getValueType(); + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) + return SDValue(); + if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) return SDValue(); LastLoadedElt = i; } @@ -5981,6 +4728,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { + assert(LDBase && "Did not find base load for merging consecutive loads"); + EVT EltVT = LDBase->getValueType(0); + // Ensure that the input vector size for the merged loads matches the + // cumulative size of the input elements. + if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) + return SDValue(); if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) @@ -5988,15 +4741,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue NewLd = SDValue(); - if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + LDBase->getPointerInfo(), LDBase->isVolatile(), + LDBase->isNonTemporal(), LDBase->isInvariant(), + LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -6009,7 +4757,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return NewLd; } - if (NumElems == 4 && LastLoadedElt == 1 && + + //TODO: The code below fires only for for loading the low v2i32 / v2f32 + //of a v4i32 / v4f32. It's probably worth generalizing. 
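The InsertPSMask assembled in the LowerBuildVectorv4x32 hunk above follows the INSERTPS immediate layout: bits 7:6 select the source element, bits 5:4 the destination element, and bits 3:0 zero out destination lanes, which is exactly what the std::bitset<4> Zeroable.to_ulong() change feeds in. A standalone sketch (plain C++; the helper name and the worked values are illustrative, not part of the patch):

// Assembles an INSERTPS immediate the same way as the hunk above:
// (source element << 6) | (destination element << 4) | zero mask.
#include <bitset>
#include <cassert>

static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx,
                            std::bitset<4> Zeroable) {
  unsigned ZMask = static_cast<unsigned>(Zeroable.to_ulong());
  unsigned Imm = (SrcIdx << 6) | (DstIdx << 4) | ZMask;
  assert((Imm & ~0xFFu) == 0 && "Invalid mask!");
  return Imm;
}

int main() {
  // Insert source element 2 into destination element 1 and zero destination
  // elements 0 and 3: binary 10 01 1001 == 0x99.
  assert(insertPSImm(2, 1, std::bitset<4>("1001")) == 0x99);
  return 0;
}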
+ EVT EltVT = VT.getVectorElementType(); + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -6134,8 +4886,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -6183,7 +4934,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -6339,8 +5091,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { AllContants = false; NonConstIdx = idx; NumNonConsts++; - } - else { + } else { NumConsts++; if (cast<ConstantSDNode>(In)->getZExtValue()) Immediate |= (1ULL << idx); @@ -6363,7 +5114,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT::getIntegerVT(VT.getSizeInBits())); DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); } - else + else DstVec = DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(NonConstIdx), @@ -6386,7 +5137,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. -/// +/// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal @@ -6407,7 +5158,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); - + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; @@ -6476,13 +5227,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by -/// a concat_vector. +/// a concat_vector. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two -/// horizontal binary operations. +/// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. 
/// @@ -6566,7 +5317,7 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, bool AddFound = false; bool SubFound = false; - for (unsigned i = 0, e = NumElts; i != e; i++) { + for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. @@ -6676,18 +5427,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } - + if (!Subtarget->hasAVX()) return SDValue(); @@ -6738,7 +5489,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); - + // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) @@ -6863,32 +5614,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); EVT VecVT = MVT::v4i32; - unsigned VecElts = 4; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - - // If using the new shuffle lowering, just directly insert this. - if (ExperimentalVectorShuffleLowering) - return DAG.getNode( - ISD::BITCAST, dl, VT, - getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); - - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - - // Now we have our 32-bit value zero extended in the low element of - // a vector. If Idx != 0, swizzle it into place. - if (Idx != 0) { - SmallVector<int, 4> Mask; - Mask.push_back(Idx); - for (unsigned i = 1; i != VecElts; ++i) - Mask.push_back(i); - Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), - &Mask[0]); - } - return DAG.getNode(ISD::BITCAST, dl, VT, Item); + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); } } @@ -6948,17 +5681,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - - // If using the new shuffle lowering, just directly insert this. - if (ExperimentalVectorShuffleLowering) - return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); - - // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); - SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i != NumElems; ++i) - MaskVec.push_back(i == Idx ? 
0 : 1); - return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } @@ -6982,12 +5705,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - // For AVX-length vectors, build the individual 128-bit pieces and use + // For AVX-length vectors, see if we can use a vector load to get all of the + // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { - SmallVector<SDValue, 64> V; - for (unsigned i = 0; i != NumElems; ++i) - V.push_back(Op.getOperand(i)); + SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); + + // Check for a build vector of consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); @@ -7091,7 +5817,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41()) { + if (Subtarget->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -7271,38 +5997,40 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, return true; } -// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC -// 2013 will allow us to use it as a non-type template parameter. -namespace { - -/// \brief Implementation of the \c isShuffleEquivalent variadic functor. -/// -/// See its documentation for details. -bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { - if (Mask.size() != Args.size()) - return false; - for (int i = 0, e = Mask.size(); i < e; ++i) { - assert(*Args[i] >= 0 && "Arguments must be positive integers!"); - if (Mask[i] != -1 && Mask[i] != *Args[i]) - return false; - } - return true; -} - -} // namespace - /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// -/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } +/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. -static const VariadicFunction1< - bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; +static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, + ArrayRef<int> ExpectedMask) { + if (Mask.size() != ExpectedMask.size()) + return false; + + int Size = Mask.size(); + + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); + + for (int i = 0; i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (!MaskBV || !ExpectedBV || + MaskBV->getOperand(Mask[i] % Size) != + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + return false; + } + + return true; +} /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 
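The 4-lane imm8 referred to above packs two bits per result element, least-significant element first. A standalone sketch of that packing (plain C++ with an invented helper name; it assumes undef lanes default to element 0, which appears to match the in-tree helper):

// Packs a 4-element shuffle mask into the PSHUFD/SHUFPS-style imm8:
// element i contributes two bits at position 2*i; -1 (undef) becomes 0.
#include <cassert>
#include <vector>

static unsigned shuffleImm8(const std::vector<int> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] == -1 ? 0 : Mask[i]; // undef lanes default to element 0
    Imm |= static_cast<unsigned>(M) << (i * 2);
  }
  return Imm;
}

int main() {
  // The element-reversal mask <3, 2, 1, 0> encodes as 0x1B, the familiar
  // "reverse" immediate for PSHUFD/SHUFPS.
  assert(shuffleImm8({3, 2, 1, 0}) == 0x1B);
  return 0;
}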
/// @@ -7328,6 +6056,37 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, return DAG.getConstant(Imm, MVT::i8); } +/// \brief Try to emit a blend instruction for a shuffle using bit math. +/// +/// This is used as a fallback approach when first class blend instructions are +/// unavailable. Currently it is only suitable for integer vectors, but could +/// be generalized for floating point vectors if desirable. +static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.isInteger() && "Only supports integer vector types!"); + MVT EltVT = VT.getScalarType(); + int NumEltBits = EltVT.getSizeInBits(); + SDValue Zero = DAG.getConstant(0, EltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT); + SmallVector<SDValue, 16> MaskOps; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) + return SDValue(); // Shuffled input! + MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); + } + + SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); + V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); + // We have to cast V2 around. + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::ANDNP, DL, MaskVT, + DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask), + DAG.getNode(ISD::BITCAST, DL, MaskVT, V2))); + return DAG.getNode(ISD::OR, DL, VT, V1, V2); +} + /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending @@ -7338,7 +6097,6 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= Size) { @@ -7415,11 +6173,17 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, } } // FALLTHROUGH + case MVT::v16i8: case MVT::v32i8: { - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + "256-bit byte-blends require AVX2 support!"); + // Scale the blend by the number of bytes per element. - int Scale = VT.getScalarSizeInBits() / 8; - assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + int Scale = VT.getScalarSizeInBits() / 8; + + // This form of blend is always done on bytes. Compute the byte vector + // type. + MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code @@ -7432,19 +6196,19 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. - SDValue VSELECTMask[32]; + SmallVector<SDValue, 32> VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) - VSELECTMask[Scale * i + j] = + VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) - : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); + : DAG.getConstant(Mask[i] < Size ? 
-1 : 0, MVT::i8)); - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); return DAG.getNode( ISD::BITCAST, DL, VT, - DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask), V1, V2)); } @@ -7453,12 +6217,45 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, } } -/// \brief Generic routine to lower a shuffle and blend as a decomposed set of -/// unblended shuffles followed by an unshuffled blend. +/// \brief Try to lower as a blend of elements from two inputs followed by +/// a single-input permutation. +/// +/// This matches the pattern where we can blend elements from two inputs and +/// then reduce the shuffle to a single-input permutation. +static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // We build up the blend mask while checking whether a blend is a viable way + // to reduce the shuffle. + SmallVector<int, 32> BlendMask(Mask.size(), -1); + SmallVector<int, 32> PermuteMask(Mask.size(), -1); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); + + if (BlendMask[Mask[i] % Size] == -1) + BlendMask[Mask[i] % Size] = Mask[i]; + else if (BlendMask[Mask[i] % Size] != Mask[i]) + return SDValue(); // Can't blend in the needed input! + + PermuteMask[i] = Mask[i] % Size; + } + + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); +} + +/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend -/// operations. +/// operations. It will try to pick the best arrangement of shuffles and +/// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, @@ -7478,6 +6275,16 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, BlendMask[i] = i + Size; } + // Try to lower with the simpler initial blend strategy unless one of the + // input shuffles would be a no-op. We prefer to shuffle inputs as the + // shuffle may be able to fold with a load or other benefit. However, when + // we'll have to do 2x as many shuffles in order to achieve this, blending + // first is a better strategy. + if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); @@ -7492,15 +6299,13 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. 
/// This matches shuffle vectors that look like: -/// +/// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] -/// +/// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -/// -/// Note that this only handles 128-bit vector widths currently. static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, @@ -7508,6 +6313,10 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] @@ -7517,44 +6326,52 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] == -1) - continue; - assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); + for (int l = 0; l < NumElts; l += NumLaneElts) { + for (int i = 0; i < NumLaneElts; ++i) { + if (Mask[l + i] == -1) + continue; + assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); - // Based on the mod-Size value of this mask element determine where - // a rotated vector would have started. - int StartIdx = i - (Mask[i] % Size); - if (StartIdx == 0) - // The identity rotation isn't interesting, stop. - return SDValue(); + // Get the mod-Size index and lane correct it. + int LaneIdx = (Mask[l + i] % NumElts) - l; + // Make sure it was in this lane. + if (LaneIdx < 0 || LaneIdx >= NumLaneElts) + return SDValue(); - // If we found the tail of a vector the rotation must be the missing - // front. If we found the head of a vector, it must be how much of the head. - int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; + // Determine where a rotated vector would have started. + int StartIdx = i - LaneIdx; + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); - if (Rotation == 0) - Rotation = CandidateRotation; - else if (Rotation != CandidateRotation) - // The rotations don't match, so we can't match this mask. - return SDValue(); + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the + // head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; - // Compute which value this mask is pointing at. - SDValue MaskV = Mask[i] < Size ? V1 : V2; - - // Compute which of the two target values this index should be assigned to. - // This reflects whether the high elements are remaining or the low elements - // are remaining. - SDValue &TargetV = StartIdx < 0 ? Hi : Lo; - - // Either set up this value if we've not encountered it before, or check - // that it remains consistent. - if (!TargetV) - TargetV = MaskV; - else if (TargetV != MaskV) - // This may be a rotation, but it pulls from the inputs in some - // unsupported interleaving. - return SDValue(); + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. 
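The lane-by-lane rotation detection above can be followed with a single-lane sketch: standalone C++ with an invented helper name, deliberately omitting the Lo/Hi source bookkeeping handled just below. For the v8i16 mask quoted in the comment it detects a rotation of 3 elements, i.e. a PALIGNR byte immediate of 3 * 2 = 6.

// Detects the element rotation for one 128-bit lane, mirroring the loop
// above; returns -1 when the mask is not a consistent rotation. The real
// code additionally tracks which input supplies the low and high parts.
#include <cassert>
#include <vector>

static int detectRotation(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;                      // undef matches anything
    int LaneIdx = Mask[i] % NumElts; // index ignoring which input it names
    int StartIdx = i - LaneIdx;      // where a rotated vector would start
    if (StartIdx == 0)
      return -1;                     // the identity rotation isn't interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                     // inconsistent rotation amounts
  }
  return Rotation;
}

int main() {
  // v8i16 [11, 12, 13, 14, 15, 0, 1, 2]: rotate by 3 elements.
  assert(detectRotation({11, 12, 13, 14, 15, 0, 1, 2}) == 3);
  return 0;
}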
+ return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; + + // Compute which of the two target values this index should be assigned + // to. This reflects whether the high elements are remaining or the low + // elements are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } } // Check that we successfully analyzed the mask, and normalize the results. @@ -7565,26 +6382,27 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, else if (!Hi) Hi = Lo; - assert(VT.getSizeInBits() == 128 && - "Rotate-based lowering only supports 128-bit lowering!"); - assert(Mask.size() <= 16 && - "Can shuffle at most 16 bytes in a 128-bit vector!"); - // The actual rotate instruction rotates bytes, so we need to scale the - // rotation based on how many bytes are in the vector. - int Scale = 16 / Mask.size(); + // rotation based on how many bytes are in the vector lane. + int Scale = 16 / NumLaneElts; - // SSSE3 targets can use the palignr instruction + // SSSE3 targets can use the palignr instruction. if (Subtarget->hasSSSE3()) { - // Cast the inputs to v16i8 to match PALIGNR. - Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); - Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); + // Cast the inputs to i8 vector of correct length to match PALIGNR. + MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi); return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, + DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, DAG.getConstant(Rotation * Scale, MVT::i8))); } + assert(VT.getSizeInBits() == 128 && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; @@ -7594,9 +6412,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, - DAG.getConstant(8 * LoByteShift, MVT::i8)); + DAG.getConstant(LoByteShift, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, - DAG.getConstant(8 * HiByteShift, MVT::i8)); + DAG.getConstant(HiByteShift, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } @@ -7613,6 +6431,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); @@ -7624,10 +6447,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, continue; } - // If this is an index into a build_vector node, dig out the input value and - // use it. 
+ // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR) + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) continue; SDValue Input = V.getOperand(M % Size); @@ -7640,85 +6463,133 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } -/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). -/// -/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 -/// byte-shift instructions. The mask must consist of a shifted sequential -/// shuffle from one of the input vectors and zeroable elements for the -/// remaining 'shifted in' elements. +/// \brief Try to emit a bitmask instruction for a shuffle. /// -/// Note that this only handles 128-bit vector widths currently. -static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. +static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getScalarType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero); + AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and +/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function +/// matches elements from one of the input vectors shuffled to the left or +/// right with zeroable elements 'shifted in'. It handles both the strictly +/// bit-wise element shifts and the byte shift across an entire 128-bit double +/// quad word lane. +/// +/// PSHL : (little-endian) left bit shift. +/// [ zz, 0, zz, 2 ] +/// [ -1, 4, zz, -1 ] +/// PSRL : (little-endian) right bit shift. 
+/// [ 1, zz, 3, zz] +/// [ -1, -1, 7, zz] +/// PSLLDQ : (little-endian) left byte shift +/// [ zz, 0, 1, 2, 3, 4, 5, 6] +/// [ zz, zz, -1, -1, 2, 3, 4, -1] +/// [ zz, zz, zz, zz, zz, zz, -1, 1] +/// PSRLDQ : (little-endian) right byte shift +/// [ 5, 6, 7, zz, zz, zz, zz, zz] +/// [ -1, 5, 6, 7, zz, zz, zz, zz] +/// [ 1, 2, -1, -1, -1, -1, zz, zz] +static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Size = Mask.size(); - int Scale = 16 / Size; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + auto CheckZeros = [&](int Shift, int Scale, bool Left) { + for (int i = 0; i < Size; i += Scale) + for (int j = 0; j < Shift; ++j) + if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) + return false; - auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset, - ArrayRef<int> Mask) { - for (int i = StartIndex; i < EndIndex; i++) { - if (Mask[i] < 0) - continue; - if (i + Base != Mask[i] - MaskOffset) - return false; - } return true; }; - for (int Shift = 1; Shift < Size; Shift++) { - int ByteShift = Shift * Scale; - - // PSRLDQ : (little-endian) right byte shift - // [ 5, 6, 7, zz, zz, zz, zz, zz] - // [ -1, 5, 6, 7, zz, zz, zz, zz] - // [ 1, 2, -1, -1, -1, -1, zz, zz] - bool ZeroableRight = true; - for (int i = Size - Shift; i < Size; i++) { - ZeroableRight &= Zeroable[i]; - } - - if (ZeroableRight) { - bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask); - bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask); - - if (ValidShiftRight1 || ValidShiftRight2) { - // Cast the inputs to v2i64 to match PSRLDQ. - SDValue &TargetV = ValidShiftRight1 ? V1 : V2; - SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); - SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, - DAG.getConstant(ByteShift * 8, MVT::i8)); - return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); - } + auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { + for (int i = 0; i != Size; i += Scale) { + unsigned Pos = Left ? i + Shift : i; + unsigned Low = Left ? i : i + Shift; + unsigned Len = Scale - Shift; + if (!isSequentialOrUndefInRange(Mask, Pos, Len, + Low + (V == V1 ? 0 : Size))) + return SDValue(); } - // PSLLDQ : (little-endian) left byte shift - // [ zz, 0, 1, 2, 3, 4, 5, 6] - // [ zz, zz, -1, -1, 2, 3, 4, -1] - // [ zz, zz, zz, zz, zz, zz, -1, 1] - bool ZeroableLeft = true; - for (int i = 0; i < Shift; i++) { - ZeroableLeft &= Zeroable[i]; - } + int ShiftEltBits = VT.getScalarSizeInBits() * Scale; + bool ByteShift = ShiftEltBits > 64; + unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); + int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); - if (ZeroableLeft) { - bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask); - bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask); + // Normalize the scale for byte shifts to still produce an i64 element + // type. + Scale = ByteShift ? Scale / 2 : Scale; - if (ValidShiftLeft1 || ValidShiftLeft2) { - // Cast the inputs to v2i64 to match PSLLDQ. - SDValue &TargetV = ValidShiftLeft1 ? 
V1 : V2; - SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); - SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, - DAG.getConstant(ByteShift * 8, MVT::i8)); - return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); - } - } - } + // We need to round trip through the appropriate type for the shift. + MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); + MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && + "Illegal integer vector type"); + V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V); + V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, V); + }; + + // SSE/AVX supports logical shifts up to 64-bit integers - so we can just + // keep doubling the size of the integer elements up to that. We can + // then shift the elements of the integer vector by whole multiples of + // their width within the elements of the larger integer vector. Test each + // multiple to see if we can find a match with the moved element indices + // and that the shifted in elements are all zeroable. + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) + for (int Shift = 1; Shift != Scale; ++Shift) + for (bool Left : {true, false}) + if (CheckZeros(Shift, Scale, Left)) + for (SDValue V : {V1, V2}) + if (SDValue Match = MatchShift(Shift, Scale, Left, V)) + return Match; + + // no match return SDValue(); } @@ -7728,10 +6599,11 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, /// stride, produce either a zero or any extension based on the available /// features of the subtarget. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( - SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV, + SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); - int EltBits = VT.getSizeInBits() / NumElements; + int NumElements = VT.getVectorNumElements(); + int EltBits = VT.getScalarSizeInBits(); assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); @@ -7739,10 +6611,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { - MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); - InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); } @@ -7800,7 +6670,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( return DAG.getNode(ISD::BITCAST, DL, VT, InputV); } -/// \brief Try to lower a vector shuffle as a zero extension on any micrarch. +/// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. 
It doesn't @@ -7818,7 +6688,10 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); - int NumElements = Mask.size(); + int NumElements = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() <= 32 && + "Exceeds 32-bit integer zero extension limit"); + assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. @@ -7829,11 +6702,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (Mask[i] == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { - // Each of the extend elements needs to be zeroable. + // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); - // We no lorger are in the anyext case. + // We no longer are in the anyext case. AnyExt = false; continue; } @@ -7847,7 +6720,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); // Flip-flopping inputs. if (Mask[i] % NumElements != i / Scale) - return SDValue(); // Non-consecutive strided elemenst. + return SDValue(); // Non-consecutive strided elements. } // If we fail to find an input, we have a zero-shuffle which should always @@ -7857,7 +6730,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG); + DL, VT, Scale, AnyExt, InputV, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7869,11 +6742,34 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && - "The input vector size must be divisble by the extended size."); + "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } + // General extends failed, but 128-bit vectors may be able to use MOVQ. + if (Bits != 128) + return SDValue(); + + // Returns one of the source operands if the shuffle can be reduced to a + // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. + auto CanZExtLowHalf = [&]() { + for (int i = NumElements / 2; i != NumElements; ++i) + if (!Zeroable[i]) + return SDValue(); + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) + return V1; + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) + return V2; + return SDValue(); + }; + + if (SDValue V = CanZExtLowHalf()) { + V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); + return DAG.getNode(ISD::BITCAST, DL, VT, V); + } + // No viable ext lowering found. return SDValue(); } @@ -7916,7 +6812,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
static SDValue lowerVectorShuffleAsElementInsertion( - MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; @@ -7983,6 +6879,10 @@ static SDValue lowerVectorShuffleAsElementInsertion( ExtVT, V1, V2); } + // This lowering only works for the low element with floating point vectors. + if (VT.isFloatingPoint() && V2Index != 0) + return SDValue(); + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); @@ -8001,7 +6901,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant( - V2Index * EltVT.getSizeInBits(), + V2Index * EltVT.getSizeInBits()/8, DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); } @@ -8014,7 +6914,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, +static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -8086,6 +6986,199 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); } +// Check for whether we can use INSERTPS to perform the shuffle. We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. +static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; ++i) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. 
+ unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. + V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); +} + +/// \brief Try to lower a shuffle as a permute of the inputs followed by an +/// UNPCK instruction. +/// +/// This specifically targets cases where we end up with alternating between +/// the two inputs, and so can permute them into something that feeds a single +/// UNPCK instruction. Note that this routine only targets integer vectors +/// because for floating point vectors we have a generalized SHUFPS lowering +/// strategy that handles everything that doesn't *exactly* match an unpack, +/// making this clever lowering unnecessary. +static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!VT.isFloatingPoint() && + "This routine only supports integer vectors."); + assert(!isSingleInputShuffleMask(Mask) && + "This routine should only be used when blending two inputs."); + assert(Mask.size() >= 2 && "Single element masks are invalid."); + + int Size = Mask.size(); + + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { + return M >= 0 && M % Size < Size / 2; + }); + int NumHiInputs = std::count_if( + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + + bool UnpackLo = NumLoInputs >= NumHiInputs; + + auto TryUnpack = [&](MVT UnpackVT, int Scale) { + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + // Each element of the unpack contains Scale elements from this mask. + int UnpackIdx = i / Scale; + + // We only handle the case where V1 feeds the first slots of the unpack. + // We rely on canonicalization to ensure this is the case. + if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) + return SDValue(); + + // Setup the mask for this input. The indexing is tricky as we have to + // handle the unpack stride. + SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; + VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = + Mask[i] % Size; + } + + // If we will have to shuffle both inputs to use the unpack, check whether + // we can just unpack first and shuffle the result. If so, skip this unpack. + if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && + !isNoopShuffleMask(V2Mask)) + return SDValue(); + + // Shuffle the inputs into place. + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + + // Cast the inputs to the type we will use to unpack them. + V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2); + + // Unpack the inputs and cast the result back to the desired type. 
+ return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, UnpackVT, V1, V2)); + }; + + // We try each unpack from the largest to the smallest to try and find one + // that fits this mask. + int OrigNumElements = VT.getVectorNumElements(); + int OrigScalarSize = VT.getScalarSizeInBits(); + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { + int Scale = ScalarSize / OrigScalarSize; + int NumElements = OrigNumElements / Scale; + MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); + if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) + return Unpack; + } + + // If none of the unpack-rooted lowerings worked (or were profitable) try an + // initial unpack. + if (NumLoInputs == 0 || NumHiInputs == 0) { + assert((NumLoInputs > 0 || NumHiInputs > 0) && + "We have to have *some* inputs!"); + int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; + + // FIXME: We could consider the total complexity of the permute of each + // possible unpacking. Or at the least we should consider how many + // half-crossings are created. + // FIXME: We could consider commuting the unpacks. + + SmallVector<int, 32> PermMask; + PermMask.assign(Size, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); + + PermMask[i] = + 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); + } + return DAG.getVectorShuffle( + VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, + DL, VT, V1, V2), + DAG.getUNDEF(VT), PermMask); + } + + return SDValue(); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -8105,6 +7198,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Use low duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); + // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); @@ -8122,29 +7220,24 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 2)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 3)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); - // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? 
-1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. - if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. @@ -8158,6 +7251,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Subtarget, DAG)) return Blend; + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); @@ -8182,7 +7281,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -8198,37 +7297,60 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[0] < 2 && "We sort V1 to be the first input."); + assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + + // If we have a blend of two PACKUS operations an the blend aligns with the + // low and half halves, we can just merge the PACKUS operations. This is + // particularly important as it lets us merge shuffles that this routine itself + // creates. + auto GetPackNode = [](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - // If we have a single input from V2 insert that into V1 if we can do so - // cheaply. - if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) - return Insertion; - // Try inverting the insertion since for v2 masks it is easy to do and we - // can't reliably sort the mask one way or the other. - int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), - Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) - return Insertion; - } - - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 2)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 3)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + return V.getOpcode() == X86ISD::PACKUS ? 
V : SDValue(); + }; + if (SDValue V1Pack = GetPackNode(V1)) + if (SDValue V2Pack = GetPackNode(V2)) + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; - if (Subtarget->hasSSE41()) + // When loading a scalar and then shuffling it into a vector we can often do + // the insertion cheaply. + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v2i64, V1, V2, Mask, DAG)) - return Shift; + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8237,6 +7359,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, + Mask, DAG); + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't @@ -8247,6 +7375,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } +/// \brief Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. + if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. 
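For readers following the new helpers, here is a minimal standalone C++ sketch (illustrative only; the names and the main() driver are not part of the patch) of the rule the isSingleSHUFPSMask helper above encodes: a 4-lane mask can be realized by a single SHUFPS immediate only when the low half of the result reads from one input and the high half reads from one input, with -1 treated as a wildcard.

#include <array>

// Indices 0-3 select from the first input, 4-7 from the second, -1 is undef.
static bool isSingleShufpsMask(const std::array<int, 4> &Mask) {
  // The two low result lanes must not mix both inputs.
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  // Likewise for the two high result lanes.
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  bool Interleave = isSingleShufpsMask({0, 4, 1, 5}); // false: low half mixes both inputs
  bool Split = isSingleShufpsMask({1, 0, 6, 7});      // true: V1 feeds the low half, V2 the high half
  return (!Interleave && Split) ? 0 : 1;
}

This is why the v4f32 lowering later in the patch only attempts the blend-and-permute fallback when this predicate fails: when it holds, the plain SHUFPS lowering at the end of that path is already as cheap as the alternatives.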
@@ -8358,10 +7504,18 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, Mask, Subtarget, DAG)) return Broadcast; + // Use even/odd duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); + } + if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. @@ -8375,70 +7529,41 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(Mask, DAG)); } - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return V; - if (Subtarget->hasSSE41()) + if (Subtarget->hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; - // Check for whether we can use INSERTPS to perform the blend. We only use - // INSERTPS when the V1 elements are already in the correct locations - // because otherwise we can just always use two SHUFPS instructions which - // are much smaller to encode than a SHUFPS and an INSERTPS. - if (NumV2Elements == 1 && Subtarget->hasSSE41()) { - int V2Index = - std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - - Mask.begin(); - - // When using INSERTPS we can zero any lane of the destination. Collect - // the zero inputs into a mask and drop them from the lanes of V1 which - // actually need to be present as inputs to the INSERTPS. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - - // Synthesize a shuffle mask for the non-zero and non-v2 inputs. - bool InsertNeedsShuffle = false; - unsigned ZMask = 0; - for (int i = 0; i < 4; ++i) - if (i != V2Index) { - if (Zeroable[i]) { - ZMask |= 1 << i; - } else if (Mask[i] != i) { - InsertNeedsShuffle = true; - break; - } - } - - // We don't want to use INSERTPS or other insertion techniques if it will - // require shuffling anyways. - if (!InsertNeedsShuffle) { - // If all of V1 is zeroable, replace it with undef. - if ((ZMask | 1 << V2Index) == 0xF) - V1 = DAG.getUNDEF(MVT::v4f32); - - unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + // Use INSERTPS if we can complete the shuffle efficiently. 
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; - // Insert the V2 element into the desired position. - return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, MVT::i8)); - } + if (!isSingleSHUFPSMask(Mask)) + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, MVT::v4f32, V1, V2, Mask, DAG)) + return BlendPerm; } + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); + // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } @@ -8470,7 +7595,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -8481,36 +7606,47 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; - if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; - else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); } + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return V; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - - if (Subtarget->hasSSE41()) + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v4i32, V1, V2, Mask, DAG)) - return Shift; + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. 
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1); // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8519,6 +7655,17 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, + Mask, DAG); + + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Unpack; + // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we @@ -8542,7 +7689,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. -static SDValue lowerV8I16SingleInputVectorShuffle( +static SDValue lowerV8I16GeneralSingleInputVectorShuffle( SDLoc DL, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); @@ -8570,27 +7717,6 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, - Mask, Subtarget, DAG)) - return Broadcast; - - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); - if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); - - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V, V, Mask, DAG)) - return Shift; - - // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) - return Rotate; - // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through @@ -8993,158 +8119,56 @@ static SDValue lowerV8I16SingleInputVectorShuffle( return V; } -/// \brief Detect whether the mask pattern should be lowered through -/// interleaving. -/// -/// This essentially tests whether viewing the mask as an interleaving of two -/// sub-sequences reduces the cross-input traffic of a blend operation. 
If so, -/// lowering it through interleaving is a significantly better strategy. -static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) { - int NumEvenInputs[2] = {0, 0}; - int NumOddInputs[2] = {0, 0}; - int NumLoInputs[2] = {0, 0}; - int NumHiInputs[2] = {0, 0}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] < 0) - continue; - - int InputIdx = Mask[i] >= Size; - - if (i < Size / 2) - ++NumLoInputs[InputIdx]; - else - ++NumHiInputs[InputIdx]; - - if ((i % 2) == 0) - ++NumEvenInputs[InputIdx]; - else - ++NumOddInputs[InputIdx]; - } - - // The minimum number of cross-input results for both the interleaved and - // split cases. If interleaving results in fewer cross-input results, return - // true. - int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], - NumEvenInputs[0] + NumOddInputs[1]); - int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], - NumLoInputs[0] + NumHiInputs[1]); - return InterleavedCrosses < SplitCrosses; -} - -/// \brief Blend two v8i16 vectors using a naive unpack strategy. -/// -/// This strategy only works when the inputs from each vector fit into a single -/// half of that vector, and generally there are not so many inputs as to leave -/// the in-place shuffles required highly constrained (and thus expensive). It -/// shifts all the inputs into a single side of both input vectors and then -/// uses an unpack to interleave these inputs in a single vector. At that -/// point, we will fall back on the generic single input shuffle lowering. -static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, - SDValue V2, - MutableArrayRef<int> Mask, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); - assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); - SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; - for (int i = 0; i < 8; ++i) - if (Mask[i] >= 0 && Mask[i] < 4) - LoV1Inputs.push_back(i); - else if (Mask[i] >= 4 && Mask[i] < 8) - HiV1Inputs.push_back(i); - else if (Mask[i] >= 8 && Mask[i] < 12) - LoV2Inputs.push_back(i); - else if (Mask[i] >= 12) - HiV2Inputs.push_back(i); - - int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); - int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); - (void)NumV1Inputs; - (void)NumV2Inputs; - assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); - assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); - assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); - - bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= - HiV1Inputs.size() + HiV2Inputs.size(); - - auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs, - ArrayRef<int> HiInputs, bool MoveToLo, - int MaskOffset) { - ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs; - ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs; - if (BadInputs.empty()) - return V; - - int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int MoveOffset = MoveToLo ? 0 : 4; +/// \brief Helper to form a PSHUFB-based shuffle+blend. 
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V1Mask[16]; + SDValue V2Mask[16]; + V1InUse = false; + V2InUse = false; - if (GoodInputs.empty()) { - for (int BadInput : BadInputs) { - MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; - Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; - } + int Size = Mask.size(); + int Scale = 16 / Size; + for (int i = 0; i < 16; ++i) { + if (Mask[i / Scale] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { - if (GoodInputs.size() == 2) { - // If the low inputs are spread across two dwords, pack them into - // a single dword. - MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; - MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; - Mask[GoodInputs[0]] = MoveOffset + MaskOffset; - Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; - } else { - // Otherwise pin the good inputs. - for (int GoodInput : GoodInputs) - MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; - } - - if (BadInputs.size() == 2) { - // If we have two bad inputs then there may be either one or two good - // inputs fixed in place. Find a fixed input, and then find the *other* - // two adjacent indices by using modular arithmetic. - int GoodMaskIdx = - std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), - [](int M) { return M >= 0; }) - - std::begin(MoveMask); - int MoveMaskIdx = - ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; - assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); - assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); - MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; - MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; - Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; - } else { - assert(BadInputs.size() == 1 && "All sizes handled"); - int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, - std::end(MoveMask), -1) - - std::begin(MoveMask); - MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; - } - } - - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), - MoveMask); - }; - V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, - /*MaskOffset*/ 0); - V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, - /*MaskOffset*/ 8); - - // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes - // cross-half traffic in the final shuffle. + const int ZeroMask = 0x80; + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale + : ZeroMask; + int V2Idx = Mask[i / Scale] < Size + ? 
ZeroMask + : (Mask[i / Scale] - Size) * Scale + i % Scale; + if (Zeroable[i / Scale]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); - // Munge the mask to be a single-input mask after the unpack merges the - // results. - for (int &M : Mask) - if (M != -1) - M = 2 * (M % 4) + (M / 8); + // If we need shuffled inputs from both, blend the two. + SDValue V; + if (V1InUse && V2InUse) + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + else + V = V1InUse ? V1 : V2; - return DAG.getVectorShuffle( - MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, MVT::v8i16, V1, V2), - DAG.getUNDEF(MVT::v8i16), Mask); + // Cast the result back to the correct type. + return DAG.getNode(ISD::BITCAST, DL, VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. @@ -9181,85 +8205,95 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return ZExt; auto isV1 = [](int M) { return M >= 0 && M < 8; }; + (void)isV1; auto isV2 = [](int M) { return M >= 8; }; - int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); - if (NumV2Inputs == 0) - return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + if (NumV2Inputs == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); + if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, + Mask, Subtarget, DAG)) + return Rotate; + + return lowerV8I16GeneralSingleInputVectorShuffle(DL, V1, Mask, Subtarget, + DAG); + } - assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " - "to be V1-input shuffles."); + assert(std::any_of(Mask.begin(), Mask.end(), isV1) && + "All single-input shuffles should be canonicalized to be V1-input " + "shuffles."); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return V; - // Use dedicated unpack instructions for masks that match their pattern. 
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); - - if (Subtarget->hasSSE41()) + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V1, V2, Mask, DAG)) - return Shift; + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; - if (NumV1Inputs + NumV2Inputs <= 4) - return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); - - // Check whether an interleaving lowering is likely to be more efficient. - // This isn't perfect but it is a strong heuristic that tends to work well on - // the kinds of shuffles that show up in practice. - // - // FIXME: Handle 1x, 2x, and 4x interleaving. - if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return BitBlend; - int EMask[8], OMask[8]; - for (int i = 0; i < 4; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 4] = -1; - OMask[i + 4] = -1; - } + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Unpack; - SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); - SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); - - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); - } - - int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - - for (int i = 0; i < 4; ++i) { - LoBlendMask[i] = Mask[i]; - HiBlendMask[i] = Mask[i + 4]; + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + if (!IsBlendSupported && Subtarget->hasSSSE3()) { + bool V1InUse, V2InUse; + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); } - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); - LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); - HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); - - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); + // We can always bit-blend if we have to so the fallback strategy is to + // decompose into single-input permutes and blends. 
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even @@ -9345,40 +8379,31 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> OrigMask = SVOp->getMask(); - assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; - int MaskStorage[16] = { - OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], - OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], - OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], - OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; - MutableArrayRef<int> Mask(MaskStorage); - MutableArrayRef<int> LoMask = Mask.slice(0, 8); - MutableArrayRef<int> HiMask = Mask.slice(8, 8); - int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -9475,36 +8500,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } - // Check whether an interleaving lowering is likely to be more efficient. - // This isn't perfect but it is a strong heuristic that tends to work well on - // the kinds of shuffles that show up in practice. - // - // FIXME: We need to handle other interleaving widths (i16, i32, ...). - if (shouldLowerAsInterleaving(Mask)) { - int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { - return (M >= 0 && M < 8) || (M >= 16 && M < 24); - }); - int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { - return (M >= 8 && M < 16) || M >= 24; - }); - int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1}; - int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1}; - bool UnpackLo = NumLoHalf >= NumHiHalf; - MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8); - MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8); - for (int i = 0; i < 8; ++i) { - TargetEMask[i] = Mask[2 * i]; - TargetOMask[i] = Mask[2 * i + 1]; - } - - SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); - SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - - return DAG.getNode(UnpackLo ? 
X86ISD::UNPCKL : X86ISD::UNPCKH, DL, - MVT::v16i8, Evens, Odds); - } + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {// Low half. + 0, 16, 1, 17, 2, 18, 3, 19, + // High half. + 4, 20, 5, 21, 6, 22, 7, 23})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {// Low half. + 8, 24, 9, 25, 10, 26, 11, 27, + // High half. + 12, 28, 13, 29, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9520,33 +8526,47 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget->hasSSSE3()) { - SDValue V1Mask[16]; - SDValue V2Mask[16]; - for (int i = 0; i < 16; ++i) - if (Mask[i] == -1) { - V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); - } else { - V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8); - V2Mask[i] = - DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8); - } - V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); - if (isSingleInputShuffleMask(Mask)) - return V1; // Single inputs are easy. + bool V1InUse = false; + bool V2InUse = false; - // Otherwise, blend the two. - V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); - return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, + DAG, V1InUse, V2InUse); + + // If both V1 and V2 are in use and we can use a direct blend or an unpack, + // do so. This avoids using them to handle blends-with-zero which is + // important as a single pshufb is significantly faster for that. + if (V1InUse && V2InUse) { + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return Blend; + + // We can use an unpack to do the blending rather than an or in some + // cases. Even though the or may be (very minorly) more efficient, we + // preference this lowering because there are common cases where part of + // the complexity of the shuffles goes away when we do the final blend as + // an unpack. + // FIXME: It might be worth trying to detect if the unpack-feeding + // shuffles will both be pshufb, in which case we shouldn't bother with + // this. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Unpack; + } + + return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return BitBlend; + // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. 
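The new lowerVectorShuffleAsPSHUFB helper factored out above drives both the v8i16 and v16i8 paths; the essential trick is the control-byte encoding. Below is a standalone C++ sketch of how the two control vectors are built and why OR-ing the two shuffled inputs produces the blend. The names are illustrative, and the Scale and zeroable handling of the real helper are omitted since v16i8 already has byte-sized elements; any control byte with bit 7 set makes PSHUFB write zero into that result lane.

#include <array>
#include <cstdint>

struct PshufbControls {
  std::array<uint8_t, 16> V1Mask; // control bytes applied to the first input
  std::array<uint8_t, 16> V2Mask; // control bytes applied to the second input
};

// Mask entries 0-15 select bytes of V1, 16-31 select bytes of V2, -1 is undef.
static PshufbControls buildPshufbControls(const std::array<int, 16> &Mask) {
  const uint8_t ZeroByte = 0x80; // bit 7 set => PSHUFB zeroes this result lane
  PshufbControls C;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i] < 0) {
      C.V1Mask[i] = C.V2Mask[i] = ZeroByte; // undef lane: any value is fine
      continue;
    }
    bool FromV1 = Mask[i] < 16;
    C.V1Mask[i] = FromV1 ? uint8_t(Mask[i]) : ZeroByte;
    C.V2Mask[i] = FromV1 ? ZeroByte : uint8_t(Mask[i] - 16);
  }
  return C;
}

int main() {
  // A mask that pulls even result bytes from V1's even bytes and odd result
  // bytes from V2's even bytes.
  std::array<int, 16> Mask;
  for (int i = 0; i < 8; ++i) {
    Mask[2 * i] = 2 * i;
    Mask[2 * i + 1] = 16 + 2 * i;
  }
  PshufbControls C = buildPshufbControls(Mask);

  // Emulate PSHUFB on two constant inputs and OR the results together.
  std::array<uint8_t, 16> A, B, R;
  for (int i = 0; i < 16; ++i) { A[i] = uint8_t(i); B[i] = uint8_t(100 + i); }
  for (int i = 0; i < 16; ++i) {
    uint8_t FromA = (C.V1Mask[i] & 0x80) ? 0 : A[C.V1Mask[i] & 0x0F];
    uint8_t FromB = (C.V2Mask[i] & 0x80) ? 0 : B[C.V2Mask[i] & 0x0F];
    R[i] = FromA | FromB; // at most one side is non-zero in each lane
  }
  return R[1] == 100 ? 0 : 1; // R = {0, 100, 2, 102, 4, 104, ...}
}

This also shows why the V1InUse/V2InUse flags matter: if every control byte for one input ends up as 0x80, that entire PSHUFB (and the OR) can be dropped, which is exactly the bookkeeping the helper reports back to its callers.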
@@ -9585,72 +8605,58 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Result; } - int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + // Handle multi-input cases by blending single-input shuffles. + if (NumV2Elements > 0) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, + Mask, DAG); - auto buildBlendMasks = [](MutableArrayRef<int> HalfMask, - MutableArrayRef<int> V1HalfBlendMask, - MutableArrayRef<int> V2HalfBlendMask) { - for (int i = 0; i < 8; ++i) - if (HalfMask[i] >= 0 && HalfMask[i] < 16) { - V1HalfBlendMask[i] = HalfMask[i]; - HalfMask[i] = i; - } else if (HalfMask[i] >= 16) { - V2HalfBlendMask[i] = HalfMask[i] - 16; - HalfMask[i] = i + 8; - } - }; - buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); - buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + // The fallback path for single-input shuffles widens this into two v8i16 + // vectors with unpacks, shuffles those, and then pulls them back together + // with a pack. + SDValue V = V1; - SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] >= 0) + (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; - auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask, - MutableArrayRef<int> HiBlendMask) { - SDValue V1, V2; - // Check if any of the odd lanes in the v16i8 are used. If not, we can mask - // them out and avoid using UNPCK{L,H} to extract the elements of V as - // i16s. - if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), - [](int M) { return M >= 0 && M % 2 == 1; }) && - std::none_of(HiBlendMask.begin(), HiBlendMask.end(), - [](int M) { return M >= 0 && M % 2 == 1; })) { - // Use a mask to drop the high bytes. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); - V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, - DAG.getConstant(0x00FF, MVT::v8i16)); - - // This will be a single vector shuffle instead of a blend so nuke V2. - V2 = DAG.getUNDEF(MVT::v8i16); - - // Squash the masks to point directly into V1. - for (int &M : LoBlendMask) - if (M >= 0) - M /= 2; - for (int &M : HiBlendMask) - if (M >= 0) - M /= 2; - } else { - // Otherwise just unpack the low half of V into V1 and the high half into - // V2 so that we can blend them as i16s. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); - } + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); - SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); - SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); - return std::make_pair(BlendedLo, BlendedHi); - }; - SDValue V1Lo, V1Hi, V2Lo, V2Hi; - std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); - std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); + SDValue VLoHalf, VHiHalf; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. 
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke VHiHalf. + VHiHalf = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into VLoHalf. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into VLoHalf and the high half into + // VHiHalf so that we can blend them as i16s. + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } @@ -9736,7 +8742,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, return true; } -/// \brief Generic routine to split ector shuffle into half-sized shuffles. +/// \brief Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all @@ -9757,14 +8763,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT ScalarVT = VT.getScalarType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); - SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, - DAG.getIntPtrConstant(SplitNumElements)); - SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, - DAG.getIntPtrConstant(0)); - SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, - DAG.getIntPtrConstant(SplitNumElements)); + // Rather than splitting build-vectors, just build two narrower build + // vectors. This helps shuffling with splats and zeros. 
+ auto SplitVector = [&](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V->getOperand(0); + + MVT OrigVT = V.getSimpleValueType(); + int OrigNumElements = OrigVT.getVectorNumElements(); + int OrigSplitNumElements = OrigNumElements / 2; + MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); + + SDValue LoV, HiV; + + auto *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV) { + LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(0)); + HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(OrigSplitNumElements)); + } else { + + SmallVector<SDValue, 16> LoOps, HiOps; + for (int i = 0; i < OrigSplitNumElements; ++i) { + LoOps.push_back(BV->getOperand(i)); + HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); + } + LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); + HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); + } + return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV), + DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV)); + }; + + SDValue LoV1, HiV1, LoV2, HiV2; + std::tie(LoV1, HiV1) = SplitVector(V1); + std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef<int> HalfMask) { @@ -9960,15 +8995,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, VT.getVectorNumElements() / 2); // Check for patterns which can be matched with a single insert of a 128-bit // subvector. - if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || - isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) || + isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } - if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) { SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, @@ -9983,6 +9018,104 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(PermMask, MVT::i8)); } +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then +/// shuffling each lane. +/// +/// This will only succeed when the result of fixing the 128-bit lanes results +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in +/// each 128-bit lanes. This handles many cases where we can quickly blend away +/// the lane crosses early and then use simpler shuffles within each lane. +/// +/// FIXME: It might be worthwhile at some point to support this without +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently +/// in x86 only floating point has interesting non-repeating shuffles, and even +/// those are still *marginally* more expensive. 
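The function introduced next only fires when the mask can be decomposed into a 64-bit lane permutation followed by one repeated in-lane shuffle. A standalone restatement of that feasibility test (canMerge128BitLanes is an invented name; the DAG code interleaves this with building the actual shuffles):

#include <cstdio>
#include <vector>

// Every destination 128-bit lane must read from a single source lane, and the
// within-lane pattern must be identical across lanes. Mask values in
// [0, Size) select V1 and [Size, 2*Size) select V2, so source lanes are
// numbered across the concatenation of both inputs.
static bool canMerge128BitLanes(const std::vector<int> &Mask, int LaneSize,
                                std::vector<int> &Lanes,
                                std::vector<int> &InLaneMask) {
  int Size = (int)Mask.size();
  int NumLanes = Size / LaneSize;
  Lanes.assign(NumLanes, -1);
  InLaneMask.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int DstLane = i / LaneSize;
    int SrcLane = Mask[i] / LaneSize;
    if (Lanes[DstLane] < 0)
      Lanes[DstLane] = SrcLane;
    else if (Lanes[DstLane] != SrcLane)
      return false;                 // two different source lanes feed one lane
    int k = i % LaneSize;
    int InLane = Mask[i] % LaneSize;
    if (InLaneMask[k] < 0)
      InLaneMask[k] = InLane;
    else if (InLaneMask[k] != InLane)
      return false;                 // the in-lane pattern does not repeat
  }
  return true;
}

int main() {
  // v8f32 mask: result lane 0 reads V2's low lane, lane 1 reads V1's high lane,
  // and both lanes apply the same in-lane pattern <1, 0, 3, 2>.
  std::vector<int> Mask = {9, 8, 11, 10, 5, 4, 7, 6};
  std::vector<int> Lanes, InLane;
  std::printf("mergeable: %d\n", canMerge128BitLanes(Mask, 4, Lanes, InLane));
}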
+static SDValue lowerVectorShuffleByMerging128BitLanes( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This is only useful with multiple inputs."); + + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + int NumLanes = Size / LaneSize; + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); + + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also + // check whether the in-128-bit lane shuffles share a repeating pattern. + SmallVector<int, 4> Lanes; + Lanes.resize(NumLanes, -1); + SmallVector<int, 4> InLaneMask; + InLaneMask.resize(LaneSize, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int j = i / LaneSize; + + if (Lanes[j] < 0) { + // First entry we've seen for this lane. + Lanes[j] = Mask[i] / LaneSize; + } else if (Lanes[j] != Mask[i] / LaneSize) { + // This doesn't match the lane selected previously! + return SDValue(); + } + + // Check that within each lane we have a consistent shuffle mask. + int k = i % LaneSize; + if (InLaneMask[k] < 0) { + InLaneMask[k] = Mask[i] % LaneSize; + } else if (InLaneMask[k] != Mask[i] % LaneSize) { + // This doesn't fit a repeating in-lane mask. + return SDValue(); + } + } + + // First shuffle the lanes into place. + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, + VT.getSizeInBits() / 64); + SmallVector<int, 8> LaneMask; + LaneMask.resize(NumLanes * 2, -1); + for (int i = 0; i < NumLanes; ++i) + if (Lanes[i] >= 0) { + LaneMask[2 * i + 0] = 2*Lanes[i] + 0; + LaneMask[2 * i + 1] = 2*Lanes[i] + 1; + } + + V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); + + // Cast it back to the type we actually want. + LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); + + // Now do a simple shuffle that isn't lane crossing. + SmallVector<int, 8> NewMask; + NewMask.resize(Size, -1); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && + "Must not introduce lane crosses at this point!"); + + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); +} + +/// \brief Test whether the specified input (0 or 1) is in-place blended by the +/// given mask. +/// +/// This returns true if the elements from a particular input are already in the +/// slot required by the given mask and require no permutation. +static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) + return false; + + return true; +} + /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -10004,10 +9137,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. 
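The isShuffleMaskInputInPlace predicate defined just above gates the lane-merging path on AVX2 below; a minimal equivalent and a small worked example (isInputInPlace is an invented standalone name):

#include <cstdio>
#include <vector>

// Input 0 covers mask values [0, Size) and input 1 covers [Size, 2*Size);
// the input is "in place" when every element taken from it already sits at
// its natural position in the result.
static bool isInputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  // Pure element-wise blend of two v4 vectors: both inputs stay in place,
  // which is exactly the case where a single AVX2 blend beats lane merging.
  std::vector<int> Mask = {0, 5, 2, 7};
  std::printf("V1 in place: %d, V2 in place: %d\n",
              isInputInPlace(0, Mask), isInputInPlace(1, Mask));
}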
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, Mask, Subtarget, DAG)) return Broadcast; + // Use low duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowerid with an // interleaved permutation. @@ -10029,10 +9166,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. @@ -10040,7 +9181,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Insertion; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, @@ -10067,6 +9208,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getConstant(SHUFPDMask, MVT::i8)); } + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return Result; + // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget->hasAVX2()) @@ -10102,7 +9253,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10123,12 +9274,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); } - - // Use dedicated unpack instructions for masks that match their pattern. 
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); } // AVX2 provides a direct instruction for permuting a single input across @@ -10137,6 +9282,31 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) + return Result; + // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, DAG); @@ -10161,7 +9331,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10171,15 +9341,26 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); + + // Use even/odd duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); + if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); // Use dedicated unpack instructions for masks that match their pattern. 
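The isShuffleEquivalent checks around this point all compare against the fixed per-128-bit-lane interleave that UNPCKL/UNPCKH implement, including the commuted forms where the operands are swapped. A small standalone generator for those reference masks (unpackMask is a hypothetical helper, not one from the patch) makes the patterns easy to reproduce for any width:

#include <cstdio>
#include <vector>

// Build the mask UNPCKL (Lo=true) or UNPCKH (Lo=false) implements for a vector
// of NumElts elements split into 128-bit lanes of EltsPerLane elements each.
// Values < NumElts select the first operand; values >= NumElts select the
// second, so swapping operands corresponds to the commuted checks above.
static std::vector<int> unpackMask(int NumElts, int EltsPerLane, bool Lo) {
  std::vector<int> Mask;
  int NumLanes = NumElts / EltsPerLane;
  int Half = EltsPerLane / 2;
  for (int Lane = 0; Lane < NumLanes; ++Lane) {
    int Base = Lane * EltsPerLane + (Lo ? 0 : Half);
    for (int i = 0; i < Half; ++i) {
      Mask.push_back(Base + i);            // element from the first operand
      Mask.push_back(Base + i + NumElts);  // interleaved element from the second
    }
  }
  return Mask;
}

int main() {
  // v8f32 UNPCKL: prints 0 8 1 9 4 12 5 13, matching the check below.
  for (int M : unpackMask(/*NumElts=*/8, /*EltsPerLane=*/4, /*Lo=*/true))
    std::printf("%d ", M);
  std::printf("\n");
}

The same generator with NumElts=16 reproduces the per-lane v16i32/v16f32 patterns added for the 512-bit types later in this patch.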
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the @@ -10214,6 +9395,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Result; + // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget->hasAVX2()) @@ -10239,12 +9426,19 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10259,12 +9453,25 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); } + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return Shift; + + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. 
if (isSingleInputShuffleMask(Mask)) { @@ -10277,6 +9484,12 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); } + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Result; + // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, DAG); @@ -10297,36 +9510,53 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) return Broadcast; - // There are no generalized cross-lane shuffle operations available on i16 - // element types. - if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, - Mask, DAG); - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { @@ -10347,6 +9577,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); } + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. 
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Result; + // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } @@ -10366,17 +9602,18 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) return Broadcast; - // There are no generalized cross-lane shuffle operations available on i8 - // element types. - if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, - Mask, DAG); - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Blend; @@ -10385,21 +9622,37 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Note that these are repeated 128-bit lane unpacks, not unpacks across all // 256-bit lanes. if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + V1, V2, Mask, + {// First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + V1, V2, Mask, + {// First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + SDValue PSHUFBMask[32]; for (int i = 0; i < 32; ++i) PSHUFBMask[i] = @@ -10412,6 +9665,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); } + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. 
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } @@ -10478,6 +9737,13 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); } @@ -10493,6 +9759,20 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. + 10, 26, 11, 27, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); } @@ -10508,6 +9788,13 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); } @@ -10523,6 +9810,20 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. + 10, 26, 11, 27, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + // FIXME: Implement direct support for this type! 
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); } @@ -10574,8 +9875,8 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, "Cannot lower 512-bit vectors w/ basic ISA!"); // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = + lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have supprot for @@ -10651,6 +9952,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } + // We actually see shuffles that are entirely re-arrangements of a set of + // zero inputs. This mostly happens while decomposing complex shuffles into + // simple ones. Directly lower these as a buildvector of zeros. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.all()) + return getZeroVector(VT, Subtarget, DAG, dl); + // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 @@ -10690,7 +9998,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum - // indices for V2. + // indices for V2. When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) @@ -10707,8 +10016,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SumV2Indices += i; else if (SVOp->getMask()[i] >= 0) SumV1Indices += i; - if (SumV2Indices < SumV1Indices) + if (SumV2Indices < SumV1Indices) { return DAG.getCommutedVectorShuffle(*SVOp); + } else if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (SVOp->getMask()[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + return DAG.getCommutedVectorShuffle(*SVOp); + } } } @@ -10727,1586 +10046,6 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, llvm_unreachable("Unimplemented!"); } - -//===----------------------------------------------------------------------===// -// Legacy vector shuffle lowering -// -// This code is the legacy code handling vector shuffles until the above -// replaces its functionality and performance. -//===----------------------------------------------------------------------===// - -static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41, - bool hasInt256, unsigned *MaskOut = nullptr) { - MVT EltVT = VT.getVectorElementType(); - - // There is no blend with immediate in AVX-512. 
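Stepping back to the canonicalization change a little above: when both inputs contribute equally, the commute decision now adds an odd-index tie-break after the low-half count and the index sums. A standalone sketch of just that tie-breaking chain (shouldCommute is an invented name; the real code also handles the unequal-count case earlier):

#include <cstdio>
#include <vector>

// Mask values in [0, N) pick V1, [N, 2N) pick V2, -1 is undef. Returns true
// when swapping the operands would put more of the low, low-index, and
// even-index work on V1.
static bool shouldCommute(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i < N / 2; ++i)
    if (Mask[i] >= N)
      ++LowV2;
    else if (Mask[i] >= 0)
      ++LowV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1;

  int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
  for (int i = 0; i < N; ++i)
    if (Mask[i] >= N) {
      SumV2 += i;
      OddV2 += i % 2;
    } else if (Mask[i] >= 0) {
      SumV1 += i;
      OddV1 += i % 2;
    }
  if (SumV2 != SumV1)
    return SumV2 < SumV1;
  return OddV2 < OddV1;             // the new tie-break added by this patch
}

int main() {
  // Low-half counts and index sums tie, but V2 feeds only even positions while
  // V1 feeds the odd ones, so the odd-index tie-break requests a commute.
  std::vector<int> Mask = {8, 0, 10, 2, 4, -1, 14, -1};
  std::printf("commute: %d\n", shouldCommute(Mask));
}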
- if (VT.is512BitVector()) - return false; - - if (!hasSSE41 || EltVT == MVT::i8) - return false; - if (!hasInt256 && VT == MVT::v16i16) - return false; - - unsigned MaskValue = 0; - unsigned NumElems = VT.getVectorNumElements(); - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems - 1) / 8 + 1; - unsigned NumElemsInLane = NumElems / NumLanes; - - // Blend for v16i16 should be symetric for the both lanes. - for (unsigned i = 0; i < NumElemsInLane; ++i) { - - int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; - int EltIdx = MaskVals[i]; - - if ((EltIdx < 0 || EltIdx == (int)i) && - (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) - continue; - - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx < 0 || - (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1 << i); - else - return false; - } - - if (MaskOut) - *MaskOut = MaskValue; - return true; -} - -// Try to lower a shuffle node into a simple blend instruction. -// This function assumes isBlendMask returns true for this -// SuffleVectorSDNode -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - unsigned MaskValue, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), - Subtarget->hasInt256() && "Trying to lower a " - "VECTOR_SHUFFLE to a Blend but " - "with the wrong mask")); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); - - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); - } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); -} - -/// In vector type \p VT, return true if the element at index \p InputIdx -/// falls on a different 128-bit lane than \p OutputIdx. -static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, - unsigned OutputIdx) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; -} - -/// Generate a PSHUFB if possible. Selects elements from \p V1 according to -/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to -/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p -/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a -/// zero. -static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl, - SelectionDAG &DAG) { - MVT VT = V1.getSimpleValueType(); - assert(VT.is128BitVector() || VT.is256BitVector()); - - MVT EltVT = VT.getVectorElementType(); - unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector<SDValue, 32> PshufbMask; - for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { - int InputIdx = MaskVals[OutputIdx]; - unsigned InputByteIdx; - - if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) - InputByteIdx = 0x80; - else { - // Cross lane is not allowed. 
- if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) - return SDValue(); - InputByteIdx = InputIdx * EltSizeInBytes; - // Index is an byte offset within the 128-bit lane. - InputByteIdx &= 0xf; - } - - for (unsigned j = 0; j < EltSizeInBytes; ++j) { - PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); - if (InputByteIdx != 0x80) - ++InputByteIdx; - } - } - - MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); - if (ShufVT != VT) - V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); - return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); -} - -// v8i16 shuffles - Prefer shuffles in the following order: -// 1. [all] pshuflw, pshufhw, optional move -// 2. [ssse3] 1 x pshufb -// 3. [ssse3] 2 x pshufb + 1 x por -// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -static SDValue -LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - SmallVector<int, 8> MaskVals; - - // Determine if more than 1 of the words in each of the low and high quadwords - // of the result come from the same quadword of one of the two inputs. Undef - // mask values count as coming from any quadword, for better codegen. - // - // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input - // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2. - unsigned LoQuad[] = { 0, 0, 0, 0 }; - unsigned HiQuad[] = { 0, 0, 0, 0 }; - // Indices of quads used. - std::bitset<4> InputQuads; - for (unsigned i = 0; i < 8; ++i) { - unsigned *Quad = i < 4 ? LoQuad : HiQuad; - int EltIdx = SVOp->getMaskElt(i); - MaskVals.push_back(EltIdx); - if (EltIdx < 0) { - ++Quad[0]; - ++Quad[1]; - ++Quad[2]; - ++Quad[3]; - continue; - } - ++Quad[EltIdx / 4]; - InputQuads.set(EltIdx / 4); - } - - int BestLoQuad = -1; - unsigned MaxQuad = 1; - for (unsigned i = 0; i < 4; ++i) { - if (LoQuad[i] > MaxQuad) { - BestLoQuad = i; - MaxQuad = LoQuad[i]; - } - } - - int BestHiQuad = -1; - MaxQuad = 1; - for (unsigned i = 0; i < 4; ++i) { - if (HiQuad[i] > MaxQuad) { - BestHiQuad = i; - MaxQuad = HiQuad[i]; - } - } - - // For SSSE3, If all 8 words of the result come from only 1 quadword of each - // of the two input vectors, shuffle them into one input vector so only a - // single pshufb instruction is necessary. If there are more than 2 input - // quads, disable the next transformation since it does not help SSSE3. - bool V1Used = InputQuads[0] || InputQuads[1]; - bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3()) { - if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads[0] ? 0 : 1; - BestHiQuad = InputQuads[2] ? 2 : 3; - } - if (InputQuads.count() > 2) { - BestLoQuad = -1; - BestHiQuad = -1; - } - } - - // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update - // the shuffle mask. If a quad is scored as -1, that means that it contains - // words from all 4 input quadwords. - SDValue NewV; - if (BestLoQuad >= 0 || BestHiQuad >= 0) { - int MaskV[] = { - BestLoQuad < 0 ? 0 : BestLoQuad, - BestHiQuad < 0 ? 
1 : BestHiQuad - }; - NewV = DAG.getVectorShuffle(MVT::v2i64, dl, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); - NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); - - // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the - // source words for the shuffle, to aid later transformations. - bool AllWordsInNewV = true; - bool InOrder[2] = { true, true }; - for (unsigned i = 0; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx != (int)i) - InOrder[i/4] = false; - if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) - continue; - AllWordsInNewV = false; - break; - } - - bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; - if (AllWordsInNewV) { - for (int i = 0; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx < 0) - continue; - idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; - if ((idx != i) && idx < 4) - pshufhw = false; - if ((idx != i) && idx > 3) - pshuflw = false; - } - V1 = NewV; - V2Used = false; - BestLoQuad = 0; - BestHiQuad = 1; - } - - // If we've eliminated the use of V2, and the new mask is a pshuflw or - // pshufhw, that's as cheap as it gets. Return the new shuffle. - if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { - unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; - unsigned TargetMask = 0; - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, - DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): - getShufflePSHUFLWImmediate(SVOp); - V1 = NewV.getOperand(0); - return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); - } - } - - // Promote splats to a larger type which usually leads to more efficient code. - // FIXME: Is this true if pshufb is available? - if (SVOp->isSplat()) - return PromoteSplat(SVOp, DAG); - - // If we have SSSE3, and all words of the result are from 1 input vector, - // case 2 is generated, otherwise case 3 is generated. If no SSSE3 - // is present, fall back to case 4. - if (Subtarget->hasSSSE3()) { - SmallVector<SDValue,16> pshufbMask; - - // If we have elements from both input vectors, set the high bit of the - // shuffle mask element to zero out elements that come from V2 in the V1 - // mask, and elements that come from V1 in the V2 mask, so that the two - // results can be OR'd together. - bool TwoInputs = V1Used && V2Used; - V1 = getPSHUFB(MaskVals, V1, dl, DAG); - if (!TwoInputs) - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - - // Calculate the shuffle mask for the second input, shuffle it, and - // OR it with the first shuffled input. - CommuteVectorShuffleMask(MaskVals, 8); - V2 = getPSHUFB(MaskVals, V2, dl, DAG); - V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - } - - // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, - // and update MaskVals with new element order. 
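For reference, the quad-scoring step of the legacy v8i16 lowering being deleted here can be modeled on its own; this sketch (scoreQuads is an invented name) shows how each half of the result is attributed to the input quadword that supplies most of its words, with undef words counting toward every quad:

#include <array>
#include <cstdio>

static void scoreQuads(const std::array<int, 8> &Mask, int &BestLoQuad,
                       int &BestHiQuad) {
  unsigned LoQuad[4] = {0, 0, 0, 0}, HiQuad[4] = {0, 0, 0, 0};
  for (int i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int Elt = Mask[i];
    if (Elt < 0) {
      for (int q = 0; q < 4; ++q)   // undef counts for every quad
        ++Quad[q];
      continue;
    }
    ++Quad[Elt / 4];                // quads 0-1 are V1, quads 2-3 are V2
  }
  auto Best = [](const unsigned *Quad) {
    int BestIdx = -1;
    unsigned Max = 1;               // require more than one word to pick a quad
    for (int q = 0; q < 4; ++q)
      if (Quad[q] > Max) {
        BestIdx = q;
        Max = Quad[q];
      }
    return BestIdx;
  };
  BestLoQuad = Best(LoQuad);
  BestHiQuad = Best(HiQuad);
}

int main() {
  // Low half drawn from V1's second quad, high half from V2's first quad.
  std::array<int, 8> Mask = {4, 5, 6, -1, 8, 9, -1, 11};
  int Lo, Hi;
  scoreQuads(Mask, Lo, Hi);
  std::printf("BestLoQuad=%d BestHiQuad=%d\n", Lo, Hi);  // prints 1 and 2
}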
- std::bitset<8> InOrder; - if (BestLoQuad >= 0) { - int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; - for (int i = 0; i != 4; ++i) { - int idx = MaskVals[i]; - if (idx < 0) { - InOrder.set(i); - } else if ((idx / 4) == BestLoQuad) { - MaskV[i] = idx & 3; - InOrder.set(i); - } - } - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), - &MaskV[0]); - - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - getShufflePSHUFLWImmediate(SVOp), DAG); - } - } - - // If BestHi >= 0, generate a pshufhw to put the high elements in order, - // and update MaskVals with the new element order. - if (BestHiQuad >= 0) { - int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; - for (unsigned i = 4; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx < 0) { - InOrder.set(i); - } else if ((idx / 4) == BestHiQuad) { - MaskV[i] = (idx & 3) + 4; - InOrder.set(i); - } - } - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), - &MaskV[0]); - - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - getShufflePSHUFHWImmediate(SVOp), DAG); - } - } - - // In case BestHi & BestLo were both -1, which means each quadword has a word - // from each of the four input quadwords, calculate the InOrder bitvector now - // before falling through to the insert/extract cleanup. - if (BestLoQuad == -1 && BestHiQuad == -1) { - NewV = V1; - for (int i = 0; i != 8; ++i) - if (MaskVals[i] < 0 || MaskVals[i] == i) - InOrder.set(i); - } - - // The other elements are put in the right place using pextrw and pinsrw. - for (unsigned i = 0; i != 8; ++i) { - if (InOrder[i]) - continue; - int EltIdx = MaskVals[i]; - if (EltIdx < 0) - continue; - SDValue ExtOp = (EltIdx < 8) ? - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getIntPtrConstant(EltIdx)) : - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, - DAG.getIntPtrConstant(EltIdx - 8)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, - DAG.getIntPtrConstant(i)); - } - return NewV; -} - -/// \brief v16i16 shuffles -/// -/// FIXME: We only support generation of a single pshufb currently. We can -/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as -/// well (e.g 2 x pshufb + 1 x por). -static SDValue -LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - - if (V2.getOpcode() != ISD::UNDEF) - return SDValue(); - - SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); - return getPSHUFB(MaskVals, V1, dl, DAG); -} - -// v16i8 shuffles - Prefer shuffles in the following order: -// 1. [ssse3] 1 x pshufb -// 2. [ssse3] 2 x pshufb + 1 x por -// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw -static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, - const X86Subtarget* Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - ArrayRef<int> MaskVals = SVOp->getMask(); - - // Promote splats to a larger type which usually leads to more efficient code. - // FIXME: Is this true if pshufb is available? - if (SVOp->isSplat()) - return PromoteSplat(SVOp, DAG); - - // If we have SSSE3, case 1 is generated when all result bytes come from - // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is - // present, fall back to case 3. - - // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (Subtarget->hasSSSE3()) { - SmallVector<SDValue,16> pshufbMask; - - // If all result elements are from one input vector, then only translate - // undef mask values to 0x80 (zero out result) in the pshufb mask. - // - // Otherwise, we have elements from both input vectors, and must zero out - // elements that come from V2 in the first mask, and V1 in the second mask - // so that we can OR them together. - for (unsigned i = 0; i != 16; ++i) { - int EltIdx = MaskVals[i]; - if (EltIdx < 0 || EltIdx >= 16) - EltIdx = 0x80; - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - } - V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, pshufbMask)); - - // As PSHUFB will zero elements with negative indices, it's safe to ignore - // the 2nd operand if it's undefined or zero. - if (V2.getOpcode() == ISD::UNDEF || - ISD::isBuildVectorAllZeros(V2.getNode())) - return V1; - - // Calculate the shuffle mask for the second input, shuffle it, and - // OR it with the first shuffled input. - pshufbMask.clear(); - for (unsigned i = 0; i != 16; ++i) { - int EltIdx = MaskVals[i]; - EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - } - V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, - DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, pshufbMask)); - return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - } - - // No SSSE3 - Calculate in place words and then fix all out of place words - // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from - // the 16 different words that comprise the two doublequadword input vectors. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); - SDValue NewV = V1; - for (int i = 0; i != 8; ++i) { - int Elt0 = MaskVals[i*2]; - int Elt1 = MaskVals[i*2+1]; - - // This word of the result is all undef, skip it. - if (Elt0 < 0 && Elt1 < 0) - continue; - - // This word of the result is already in the correct place, skip it. - if ((Elt0 == i*2) && (Elt1 == i*2+1)) - continue; - - SDValue Elt0Src = Elt0 < 16 ? V1 : V2; - SDValue Elt1Src = Elt1 < 16 ? V1 : V2; - SDValue InsElt; - - // If Elt0 and Elt1 are defined, are consecutive, and can be load - // using a single extract together, load it and store it. - if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, - DAG.getIntPtrConstant(Elt1 / 2)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, - DAG.getIntPtrConstant(i)); - continue; - } - - // If Elt1 is defined, extract it from the appropriate source. 
If the - // source byte is not also odd, shift the extracted word left 8 bits - // otherwise clear the bottom 8 bits if we need to do an or. - if (Elt1 >= 0) { - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, - DAG.getIntPtrConstant(Elt1 / 2)); - if ((Elt1 & 1) == 0) - InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, - DAG.getConstant(8, - TLI.getShiftAmountTy(InsElt.getValueType()))); - else if (Elt0 >= 0) - InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, - DAG.getConstant(0xFF00, MVT::i16)); - } - // If Elt0 is defined, extract it from the appropriate source. If the - // source byte is not also even, shift the extracted word right 8 bits. If - // Elt1 was also defined, OR the extracted values together before - // inserting them in the result. - if (Elt0 >= 0) { - SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, - Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); - if ((Elt0 & 1) != 0) - InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, - DAG.getConstant(8, - TLI.getShiftAmountTy(InsElt0.getValueType()))); - else if (Elt1 >= 0) - InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, - DAG.getConstant(0x00FF, MVT::i16)); - InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) - : InsElt0; - } - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, - DAG.getIntPtrConstant(i)); - } - return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); -} - -// v32i8 shuffles - Translate to VPSHUFB if possible. -static -SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); - - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - // VPSHUFB may be generated if - // (1) one of input vector is undefined or zeroinitializer. - // The mask value 0x80 puts 0 in the corresponding slot of the vector. - // And (2) the mask indexes don't cross the 128-bit lane. - if (VT != MVT::v32i8 || !Subtarget->hasInt256() || - (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) - return SDValue(); - - if (V1IsAllZero && !V2IsAllZero) { - CommuteVectorShuffleMask(MaskVals, 32); - V1 = V2; - } - return getPSHUFB(MaskVals, V1, dl, DAG); -} - -/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide -/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be -/// done when every pair / quad of shuffle mask elements point to elements in -/// the right sequence. e.g. 
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> -static -SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); - MVT NewVT; - unsigned Scale; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected!"); - case MVT::v2i64: - case MVT::v2f64: - return SDValue(SVOp, 0); - case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; - case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; - case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; - case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; - case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; - case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; - } - - SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i != NumElems; i += Scale) { - int StartIdx = -1; - for (unsigned j = 0; j != Scale; ++j) { - int EltIdx = SVOp->getMaskElt(i+j); - if (EltIdx < 0) - continue; - if (StartIdx < 0) - StartIdx = (EltIdx / Scale); - if (EltIdx != (int)(StartIdx*Scale + j)) - return SDValue(); - } - MaskVec.push_back(StartIdx); - } - - SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); - SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); - return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); -} - -/// getVZextMovL - Return a zero-extending vector move low node. -/// -static SDValue getVZextMovL(MVT VT, MVT OpVT, - SDValue SrcOp, SelectionDAG &DAG, - const X86Subtarget *Subtarget, SDLoc dl) { - if (VT == MVT::v2f64 || VT == MVT::v4f32) { - LoadSDNode *LD = nullptr; - if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) - LD = dyn_cast<LoadSDNode>(SrcOp); - if (!LD) { - // movssrr and movsdrr do not clear top bits. Try to use movd, movq - // instead. - MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; - if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && - SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && - SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && - SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { - // PR2108 - OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - OpVT, - SrcOp.getOperand(0) - .getOperand(0)))); - } - } - } - - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::BITCAST, dl, - OpVT, SrcOp))); -} - -/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles -/// which could not be matched by any known target speficic shuffle -static SDValue -LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - - SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); - if (NewOp.getNode()) - return NewOp; - - MVT VT = SVOp->getSimpleValueType(0); - - unsigned NumElems = VT.getVectorNumElements(); - unsigned NumLaneElems = NumElems / 2; - - SDLoc dl(SVOp); - MVT EltVT = VT.getVectorElementType(); - MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); - SDValue Output[2]; - - SmallVector<int, 16> Mask; - for (unsigned l = 0; l < 2; ++l) { - // Build a shuffle mask for the output, discovering on the fly which - // input vectors to use as shuffle operands (recorded in InputUsed). - // If building a suitable shuffle vector proves too hard, then bail - // out with UseBuildVector set. - bool UseBuildVector = false; - int InputUsed[2] = { -1, -1 }; // Not yet discovered. 
- unsigned LaneStart = l * NumLaneElems; - for (unsigned i = 0; i != NumLaneElems; ++i) { - // The mask element. This indexes into the input. - int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) { - // the mask element does not index into any input vector. - Mask.push_back(-1); - continue; - } - - // The input vector this mask element indexes into. - int Input = Idx / NumLaneElems; - - // Turn the index into an offset from the start of the input vector. - Idx -= Input * NumLaneElems; - - // Find or create a shuffle vector operand to hold this input. - unsigned OpNo; - for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { - if (InputUsed[OpNo] == Input) - // This input vector is already an operand. - break; - if (InputUsed[OpNo] < 0) { - // Create a new operand for this input vector. - InputUsed[OpNo] = Input; - break; - } - } - - if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up on trying to create a - // shuffle vector. Insert all elements into a BUILD_VECTOR instead. - UseBuildVector = true; - break; - } - - // Add the mask index for the new shuffle vector. - Mask.push_back(Idx + OpNo * NumLaneElems); - } - - if (UseBuildVector) { - SmallVector<SDValue, 16> SVOps; - for (unsigned i = 0; i != NumLaneElems; ++i) { - // The mask element. This indexes into the input. - int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) { - SVOps.push_back(DAG.getUNDEF(EltVT)); - continue; - } - - // The input vector this mask element indexes into. - int Input = Idx / NumElems; - - // Turn the index into an offset from the start of the input vector. - Idx -= Input * NumElems; - - // Extract the vector element by hand. - SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, - SVOp->getOperand(Input), - DAG.getIntPtrConstant(Idx))); - } - - // Construct the output using a BUILD_VECTOR. - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); - } else if (InputUsed[0] < 0) { - // No input vectors were used! The result is undefined. - Output[l] = DAG.getUNDEF(NVT); - } else { - SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), - (InputUsed[0] % 2) * NumLaneElems, - DAG, dl); - // If only one input was used, use an undefined vector for the other. - SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : - Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), - (InputUsed[1] % 2) * NumLaneElems, DAG, dl); - // At least one input vector was used. Create a new shuffle vector. - Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); - } - - Mask.clear(); - } - - // Concatenate the result back - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); -} - -/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with -/// 4 elements, and match them with several different shuffle types. 
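The 4-element legacy routine removed below is driven by a simple case analysis on how many result elements each input supplies; a hedged standalone restatement of that decision (pickStrategy is an invented name, and the strings stand in for the code paths):

#include <array>
#include <cstdio>

// <= 2 elements from each side can be done with two shuffles; a 3-and-1 split
// uses the shufps-based special case; everything else falls back to the
// lo/hi decomposition.
static const char *pickStrategy(const std::array<int, 4> &Mask) {
  unsigned NumLo = 0, NumHi = 0;
  for (int Idx : Mask) {
    if (Idx < 0)
      continue;
    (Idx < 4 ? NumLo : NumHi)++;
  }
  if (NumLo <= 2 && NumHi <= 2)
    return "two shuffles";
  if (NumLo == 3 || NumHi == 3)
    return "shufps with 3+1 split";
  return "lo/hi decomposition";
}

int main() {
  std::printf("%s\n", pickStrategy({0, 4, 1, 5}));  // two shuffles
  std::printf("%s\n", pickStrategy({0, 1, 2, 7}));  // shufps with 3+1 split
}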
-static SDValue -LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - MVT VT = SVOp->getSimpleValueType(0); - - assert(VT.is128BitVector() && "Unsupported vector size"); - - std::pair<int, int> Locs[4]; - int Mask1[] = { -1, -1, -1, -1 }; - SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); - - unsigned NumHi = 0; - unsigned NumLo = 0; - for (unsigned i = 0; i != 4; ++i) { - int Idx = PermMask[i]; - if (Idx < 0) { - Locs[i] = std::make_pair(-1, -1); - } else { - assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); - if (Idx < 4) { - Locs[i] = std::make_pair(0, NumLo); - Mask1[NumLo] = Idx; - NumLo++; - } else { - Locs[i] = std::make_pair(1, NumHi); - if (2+NumHi < 4) - Mask1[2+NumHi] = Idx; - NumHi++; - } - } - } - - if (NumLo <= 2 && NumHi <= 2) { - // If no more than two elements come from either vector. This can be - // implemented with two shuffles. First shuffle gather the elements. - // The second shuffle, which takes the first shuffle as both of its - // vector operands, put the elements into the right order. - V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - - int Mask2[] = { -1, -1, -1, -1 }; - - for (unsigned i = 0; i != 4; ++i) - if (Locs[i].first != -1) { - unsigned Idx = (i < 2) ? 0 : 4; - Idx += Locs[i].first * 2 + Locs[i].second; - Mask2[i] = Idx; - } - - return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); - } - - if (NumLo == 3 || NumHi == 3) { - // Otherwise, we must have three elements from one vector, call it X, and - // one element from the other, call it Y. First, use a shufps to build an - // intermediate vector with the one element from Y and the element from X - // that will be in the same half in the final destination (the indexes don't - // matter). Then, use a shufps to build the final vector, taking the half - // containing the element from Y from the intermediate, and the other half - // from X. - if (NumHi == 3) { - // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, 4); - std::swap(V1, V2); - } - - // Find the element from V2. - unsigned HiIndex; - for (HiIndex = 0; HiIndex < 3; ++HiIndex) { - int Val = PermMask[HiIndex]; - if (Val < 0) - continue; - if (Val >= 4) - break; - } - - Mask1[0] = PermMask[HiIndex]; - Mask1[1] = -1; - Mask1[2] = PermMask[HiIndex^1]; - Mask1[3] = -1; - V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - - if (HiIndex >= 2) { - Mask1[0] = PermMask[0]; - Mask1[1] = PermMask[1]; - Mask1[2] = HiIndex & 1 ? 6 : 4; - Mask1[3] = HiIndex & 1 ? 4 : 6; - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - } - - Mask1[0] = HiIndex & 1 ? 2 : 0; - Mask1[1] = HiIndex & 1 ? 0 : 2; - Mask1[2] = PermMask[2]; - Mask1[3] = PermMask[3]; - if (Mask1[2] >= 0) - Mask1[2] += 4; - if (Mask1[3] >= 0) - Mask1[3] += 4; - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); - } - - // Break it into (shuffle shuffle_hi, shuffle_lo). 
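The first strategy in the deleted LowerVECTOR_SHUFFLE_128v4 (at most two result lanes taken from each source) can be captured as pure mask arithmetic: one gather mask that pulls the needed lanes into the low and high halves of a temporary, and one reorder mask applied to that temporary. An illustrative helper, assuming the usual -1 convention for undef:

#include <array>
#include <utility>

// Hypothetical helper reproducing just the mask arithmetic of the two-shuffle
// strategy: Mask1 gathers, Mask2 reorders the gathered temporary.
static bool buildTwoShuffleMasks(const std::array<int, 4> &PermMask,
                                 std::array<int, 4> &Mask1,   // gather step
                                 std::array<int, 4> &Mask2) { // reorder step
  std::array<std::pair<int, int>, 4> Locs;
  Mask1.fill(-1);
  Mask2.fill(-1);
  unsigned NumLo = 0, NumHi = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = {-1, -1};
    } else if (Idx < 4) {               // lane of the first source
      Locs[i] = {0, int(NumLo)};
      Mask1[NumLo++] = Idx;
    } else {                            // lane of the second source
      Locs[i] = {1, int(NumHi)};
      if (2 + NumHi < 4)
        Mask1[2 + NumHi] = Idx;
      NumHi++;
    }
  }
  if (NumLo > 2 || NumHi > 2)
    return false;                       // needs the 3+1 or split strategy
  // The second shuffle reads the gathered temporary as both of its operands.
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      Mask2[i] = (i < 2 ? 0 : 4) + Locs[i].first * 2 + Locs[i].second;
  return true;
}

The 3 + 1 and lo/hi split cases that follow in the deleted code take over exactly when this returns false.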
- int LoMask[] = { -1, -1, -1, -1 }; - int HiMask[] = { -1, -1, -1, -1 }; - - int *MaskPtr = LoMask; - unsigned MaskIdx = 0; - unsigned LoIdx = 0; - unsigned HiIdx = 2; - for (unsigned i = 0; i != 4; ++i) { - if (i == 2) { - MaskPtr = HiMask; - MaskIdx = 1; - LoIdx = 0; - HiIdx = 2; - } - int Idx = PermMask[i]; - if (Idx < 0) { - Locs[i] = std::make_pair(-1, -1); - } else if (Idx < 4) { - Locs[i] = std::make_pair(MaskIdx, LoIdx); - MaskPtr[LoIdx] = Idx; - LoIdx++; - } else { - Locs[i] = std::make_pair(MaskIdx, HiIdx); - MaskPtr[HiIdx] = Idx; - HiIdx++; - } - } - - SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); - SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - int MaskOps[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) - if (Locs[i].first != -1) - MaskOps[i] = Locs[i].first * 4 + Locs[i].second; - return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); -} - -static bool MayFoldVectorLoad(SDValue V) { - while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) - V = V.getOperand(0); - if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && - V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) - // BUILD_VECTOR (load), undef - V = V.getOperand(0); - - return MayFoldLoad(V); -} - -static -SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - // Canonizalize to v2f64. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); - return DAG.getNode(ISD::BITCAST, dl, VT, - getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, - V1, DAG)); -} - -static -SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, - bool HasSSE2) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - - assert(VT != MVT::v2i64 && "unsupported shuffle type"); - - if (HasSSE2 && VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); - - // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) - return DAG.getNode(ISD::BITCAST, dl, VT, - getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); -} - -static -SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - - assert((VT == MVT::v4i32 || VT == MVT::v4f32) && - "unsupported shuffle type"); - - if (V2.getOpcode() == ISD::UNDEF) - V2 = V1; - - // v4i32 or v4f32 - return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); -} - -static -SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - unsigned NumElems = VT.getVectorNumElements(); - - // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second - // operand of these instructions is only memory, so check if there's a - // potencial load folding here, otherwise use SHUFPS or MOVSD to match the - // same masks. - bool CanFoldLoad = false; - - // Trivial case, when V2 comes from a load. 
- if (MayFoldVectorLoad(V2)) - CanFoldLoad = true; - - // When V1 is a load, it can be folded later into a store in isel, example: - // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) - // turns into: - // (MOVLPSmr addr:$src1, VR128:$src2) - // So, recognize this potential and also use MOVLPS or MOVLPD - else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) - CanFoldLoad = true; - - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - if (CanFoldLoad) { - if (HasSSE2 && NumElems == 2) - return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); - - if (NumElems == 4) - // If we don't care about the second element, proceed to use movss. - if (SVOp->getMaskElt(1) != -1) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); - } - - // movl and movlp will both match v2i64, but v2i64 is never matched by - // movl earlier because we make it strict to avoid messing with the movlp load - // folding logic (see the code above getMOVLP call). Match it here then, - // this is horrible, but will stay like this until we move all shuffle - // matching to x86 specific nodes. Note that for the 1st condition all - // types are matched with movsd. - if (HasSSE2) { - // FIXME: isMOVLMask should be checked and matched before getMOVLP, - // as to remove this logic from here, as much as possible - if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); - } - - assert(VT != MVT::v4i32 && "unsupported shuffle type"); - - // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, - getShuffleSHUFImmediate(SVOp), DAG); -} - -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -// It is only safe to call this function if isINSERTPSMask is true for -// this shufflevector mask. -static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, - SelectionDAG &DAG) { - // Generate an insertps instruction when inserting an f32 from memory onto a - // v4f32 or when copying a member from one v4f32 to another. - // We also use it for transferring i32 from one register to another, - // since it simply copies the same bits. - // If we're transferring an i32 from memory to a specific element in a - // register, we output a generic DAG that will match the PINSRD - // instruction. 
- MVT VT = SVOp->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - auto Mask = SVOp->getMask(); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "unsupported vector type for insertps/pinsrd"); - - auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; - auto FromV2Predicate = [](const int &i) { return i >= 4; }; - int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); - - SDValue From; - SDValue To; - unsigned DestIndex; - if (FromV1 == 1) { - From = V1; - To = V2; - DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - - Mask.begin(); - - // If we have 1 element from each vector, we have to check if we're - // changing V1's element's place. If so, we're done. Otherwise, we - // should assume we're changing V2's element's place and behave - // accordingly. - int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); - assert(DestIndex <= INT32_MAX && "truncated destination index"); - if (FromV1 == FromV2 && - static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { - From = V2; - To = V1; - DestIndex = - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); - } - } else { - assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && - "More than one element from V1 and from V2, or no elements from one " - "of the vectors. This case should not have returned true from " - "isINSERTPSMask"); - From = V2; - To = V1; - DestIndex = - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); - } - - // Get an index into the source vector in the range [0,4) (the mask is - // in the range [0,8) because it can address V1 and V2) - unsigned SrcIndex = Mask[DestIndex] % 4; - if (MayFoldLoad(From)) { - // Trivial case, when From comes from a load and is only used by the - // shuffle. Make it use insertps from the vector that we need from that - // load. - SDValue NewLoad = - NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG); - if (!NewLoad.getNode()) - return SDValue(); - - if (EVT == MVT::f32) { - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, - InsertpsMask); - } else { // EVT == MVT::i32 - // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT - // instruction, to match the PINSRD instruction, which loads an i32 to a - // certain vector element. - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, - DAG.getConstant(DestIndex, MVT::i32)); - } - } - - // Vector-element-to-vector - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); -} - -// Reduce a vector shuffle to zext. -static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - // PMOVZX is only available from SSE41. - if (!Subtarget->hasSSE41()) - return SDValue(); - - MVT VT = Op.getSimpleValueType(); - - // Only AVX2 support 256-bit vector integer extending. 
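The immediates built above follow the SSE4.1 INSERTPS encoding: bits 7:6 select the source element, bits 5:4 the destination element, and bits 3:0 form a zero mask, which this path leaves clear. A small worked encoder (the function name is illustrative):

#include <cassert>
#include <cstdint>

// INSERTPS xmm1, xmm2/m32, imm8:
//   imm[7:6] = COUNT_S : element read from the source register
//   imm[5:4] = COUNT_D : element written in the destination
//   imm[3:0] = ZMASK   : result elements forced to +0.0
static uint8_t insertpsImm(unsigned SrcIdx, unsigned DstIdx,
                           unsigned ZeroMask = 0) {
  assert(SrcIdx < 4 && DstIdx < 4 && ZeroMask < 16);
  return uint8_t((SrcIdx << 6) | (DstIdx << 4) | ZeroMask);
}

// Copying element 2 of the source into element 1 of the destination gives
// insertpsImm(2, 1) == 0x90, matching DestIndex << 4 | SrcIndex << 6 above.
// The load path above only sets DestIndex << 4 because the loaded scalar
// sits in element 0 of the scalar_to_vector operand.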
- if (!Subtarget->hasInt256() && VT.is256BitVector()) - return SDValue(); - - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDLoc DL(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - unsigned NumElems = VT.getVectorNumElements(); - - // Extending is an unary operation and the element type of the source vector - // won't be equal to or larger than i64. - if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || - VT.getVectorElementType() == MVT::i64) - return SDValue(); - - // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. - unsigned Shift = 1; // Start from 2, i.e. 1 << 1. - while ((1U << Shift) < NumElems) { - if (SVOp->getMaskElt(1U << Shift) == 1) - break; - Shift += 1; - // The maximal ratio is 8, i.e. from i8 to i64. - if (Shift > 3) - return SDValue(); - } - - // Check the shuffle mask. - unsigned Mask = (1U << Shift) - 1; - for (unsigned i = 0; i != NumElems; ++i) { - int EltIdx = SVOp->getMaskElt(i); - if ((i & Mask) != 0 && EltIdx != -1) - return SDValue(); - if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) - return SDValue(); - } - - unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - MVT NeVT = MVT::getIntegerVT(NBits); - MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) - return SDValue(); - - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); -} - -static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - MVT VT = Op.getSimpleValueType(); - SDLoc dl(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget, DAG, dl); - - // Handle splat operations - if (SVOp->isSplat()) { - // Use vbroadcast whenever the splat comes from a foldable load - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); - if (Broadcast.getNode()) - return Broadcast; - } - - // Check integer expanding shuffles. - SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - - // If the shuffle can be profitably rewritten as a narrower shuffle, then - // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || - VT == MVT::v32i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) - return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { - // FIXME: Figure out a cleaner way to do this. 
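The mask test in the deleted LowerVectorIntExtend recognizes extension-shaped shuffles, for instance <0,u,u,u,1,u,u,u,...> for a ratio of four. A standalone sketch that recovers the ratio, mirroring the checks above (hypothetical helper, returns 0 on failure):

#include <vector>

static unsigned matchExtensionRatio(const std::vector<int> &Mask) {
  unsigned NumElems = unsigned(Mask.size());
  unsigned Shift = 1;                       // candidate ratio is 1 << Shift
  while ((1u << Shift) < NumElems) {
    if (Mask[1u << Shift] == 1)
      break;                                // found where element 1 lands
    if (++Shift > 3)
      return 0;                             // ratios beyond i8 -> i64 give up
  }
  unsigned Ratio = 1u << Shift;
  for (unsigned i = 0; i != NumElems; ++i) {
    int Elt = Mask[i];
    if (i % Ratio != 0) {
      if (Elt != -1)
        return 0;                           // gaps must be undef
    } else if (Elt != int(i / Ratio)) {
      return 0;                             // element k must sit at k * Ratio
    }
  }
  return Ratio;
}

For the v16i8 mask <0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u> this returns 4, i.e. a v16i8 to v4i32 extension candidate.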
- if (ISD::isBuildVectorAllZeros(V2.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) { - MVT NewVT = NewOp.getSimpleValueType(); - if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), - NewVT, true, false)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, - dl); - } - } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) { - MVT NewVT = NewOp.getSimpleValueType(); - if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, - dl); - } - } - } - return SDValue(); -} - -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - SDLoc dl(Op); - unsigned NumElems = VT.getVectorNumElements(); - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - bool V1IsSplat = false; - bool V2IsSplat = false; - bool HasSSE2 = Subtarget->hasSSE2(); - bool HasFp256 = Subtarget->hasFp256(); - bool HasInt256 = Subtarget->hasInt256(); - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); - - // Check if we should use the experimental vector shuffle lowering. If so, - // delegate completely to that code path. - if (ExperimentalVectorShuffleLowering) - return lowerVectorShuffle(Op, Subtarget, DAG); - - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); - - if (V1IsUndef && V2IsUndef) - return DAG.getUNDEF(VT); - - // When we create a shuffle node we put the UNDEF node to second operand, - // but in some cases the first operand may be transformed to UNDEF. - // In this case we should just commute the node. - if (V1IsUndef) - return DAG.getCommutedVectorShuffle(*SVOp); - - // Vector shuffle lowering takes 3 steps: - // - // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable - // narrowing and commutation of operands should be handled. - // 2) Matching of shuffles with known shuffle masks to x86 target specific - // shuffle nodes. - // 3) Rewriting of unmatched masks into new generic shuffle operations, - // so the shuffle can be broken into other shuffles and the legalizer can - // try the lowering again. - // - // The general idea is that no vector_shuffle operation should be left to - // be matched during isel, all of them must be converted to a target specific - // node here. - - // Normalize the input vectors. Here splats, zeroed vectors, profitable - // narrowing and commutation of operands should be handled. The actual code - // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - - SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); - - // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and - // unpckh_undef). Only use pshufd if speed is more important than size. 
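Commuting the shuffle when V1 is undef, as done above, only swaps the operands and re-bases every mask index by the element count. The index rewrite in isolation (illustrative):

#include <vector>

// Swap the two sources of a shuffle: indices that pointed into the first
// operand now point into the second and vice versa; undef (-1) is unchanged.
// e.g. on a 4-lane shuffle, <0,5,2,7> becomes <4,1,6,3>.
static void commuteShuffleMask(std::vector<int> &Mask, unsigned NumElems) {
  for (int &Elt : Mask) {
    if (Elt < 0)
      continue;
    Elt = Elt < int(NumElems) ? Elt + int(NumElems) : Elt - int(NumElems);
  }
}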
- if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && - V2IsUndef && MayFoldVectorLoad(V1)) - return getMOVDDup(Op, dl, V1, DAG); - - if (isMOVHLPS_v_undef_Mask(M, VT)) - return getMOVHighToLow(Op, dl, DAG); - - // Use to match splats - if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && - (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - if (isPSHUFDMask(M, VT)) { - // The actual implementation will match the mask in the if above and then - // during isel it can match several different instructions, not only pshufd - // as its name says, sad but true, emulate the behavior for now... - if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); - - unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - - if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - - if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, - DAG); - - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, - TargetMask, DAG); - } - - if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, - getShufflePALIGNRImmediate(SVOp), - DAG); - - if (isVALIGNMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, - getShuffleVALIGNImmediate(SVOp), - DAG); - - // Check if this can be converted into a logical shift. - bool isLeft = false; - unsigned ShAmt = 0; - SDValue ShVal; - bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); - if (isShift && ShVal.hasOneUse()) { - // If the shifted value has multiple uses, it may be cheaper to use - // v_set0 + movlhps or movhlps, etc. - MVT EltVT = VT.getVectorElementType(); - ShAmt *= EltVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); - } - - if (isMOVLMask(M, VT)) { - if (ISD::isBuildVectorAllZeros(V1.getNode())) - return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!isMOVLPMask(M, VT)) { - if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); - - if (VT == MVT::v4i32 || VT == MVT::v4f32) - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); - } - } - - // FIXME: fold these into legal mask. - if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) - return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - - if (isMOVHLPSMask(M, VT)) - return getMOVHighToLow(Op, dl, DAG); - - if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - - if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - - if (isMOVLPMask(M, VT)) - return getMOVLP(Op, dl, DAG, HasSSE2); - - if (ShouldXformToMOVHLPS(M, VT) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) - return DAG.getCommutedVectorShuffle(*SVOp); - - if (isShift) { - // No better options. Use a vshldq / vsrldq. 
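getShuffleSHUFImmediate, used for the PSHUFD and VPERMILPI cases above, packs a single-input 4-lane mask into an imm8 with two bits per destination lane. A worked encoder for that layout (the SHUFPS form additionally reduces indices modulo 4; this sketch covers only the single-input case):

#include <cassert>
#include <cstdint>

// PSHUFD xmm, xmm/m128, imm8: destination lane i receives source lane
// (imm >> (2 * i)) & 3. Undef lanes may be encoded as anything; 0 is used.
static uint8_t shufImmediate(const int (&Mask)[4]) {
  uint8_t Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Elt = Mask[i] < 0 ? 0 : Mask[i];
    assert(Elt < 4 && "single-input 4-lane mask expected");
    Imm |= uint8_t(Elt) << (2 * i);
  }
  return Imm;
}

// Example: the reverse mask <3,2,1,0> encodes as 0x1B (0b00011011).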
- MVT EltVT = VT.getVectorElementType(); - ShAmt *= EltVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); - } - - bool Commuted = false; - // FIXME: This should also accept a bitcast of a splat? Be careful, not - // 1,1,1,1 -> v8i16 though. - BitVector UndefElements; - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode())) - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) - V1IsSplat = true; - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode())) - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) - V2IsSplat = true; - - // Canonicalize the splat or undef, if present, to be on the RHS. - if (!V2IsUndef && V1IsSplat && !V2IsSplat) { - CommuteVectorShuffleMask(M, NumElems); - std::swap(V1, V2); - std::swap(V1IsSplat, V2IsSplat); - Commuted = true; - } - - if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { - // Shuffling low element of v1 into undef, just return v1. - if (V2IsUndef) - return V1; - // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which - // the instruction selector will not match, so get a canonical MOVL with - // swapped operands to undo the commute. - return getMOVL(DAG, dl, VT, V2, V1); - } - - if (isUNPCKLMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - - if (isUNPCKHMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - - if (V2IsSplat) { - // Normalize mask so all entries that point to V2 points to its first - // element then try to match unpck{h|l} again. If match, return a - // new vector_shuffle with the corrected mask.p - SmallVector<int, 8> NewMask(M.begin(), M.end()); - NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasInt256, true)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (isUNPCKHMask(NewMask, VT, HasInt256, true)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } - - if (Commuted) { - // Commute is back and try unpck* again. - // FIXME: this seems wrong. - CommuteVectorShuffleMask(M, NumElems); - std::swap(V1, V2); - std::swap(V1IsSplat, V2IsSplat); - - if (isUNPCKLMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - - if (isUNPCKHMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } - - // Normalize the node to match x86 shuffle ops if needed - if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) - return DAG.getCommutedVectorShuffle(*SVOp); - - // The checks below are all present in isShuffleMaskLegal, but they are - // inlined here right now to enable us to directly emit target specific - // nodes, and remove one by one until they don't return Op anymore. 
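The isUNPCKLMask and isUNPCKHMask checks above match interleaving masks: unpack-low interleaves the low halves of both sources, unpack-high the upper halves. A reference check for the 128-bit case (the 256-bit AVX forms repeat the same pattern within each 128-bit lane, which this sketch does not model):

#include <vector>

// unpcklps on v4f32 matches <0,4,1,5>; unpckhps matches <2,6,3,7>.
// Undef (-1) entries are accepted anywhere.
static bool isUnpackMask(const std::vector<int> &Mask, bool High) {
  unsigned N = unsigned(Mask.size());
  unsigned Base = High ? N / 2 : 0;
  for (unsigned i = 0; i != N / 2; ++i) {
    int A = Mask[2 * i], B = Mask[2 * i + 1];
    if (A >= 0 && A != int(Base + i))
      return false;                  // lane from the first source
    if (B >= 0 && B != int(N + Base + i))
      return false;                  // matching lane from the second source
  }
  return true;
}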
- - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && - SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64 || VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - } - - if (isPSHUFHWMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - getShufflePSHUFHWImmediate(SVOp), - DAG); - - if (isPSHUFLWMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - getShufflePSHUFLWImmediate(SVOp), - DAG); - - unsigned MaskValue; - if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), - &MaskValue)) - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); - - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, - getShuffleSHUFImmediate(SVOp), DAG); - - if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - //===--------------------------------------------------------------------===// - // Generate target specific nodes for 128 or 256-bit shuffles only - // supported in the AVX instruction set. - // - - // Handle VMOVDDUPY permutations - if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) - return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - - // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT)) { - if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, - getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, - getShuffleSHUFImmediate(SVOp), DAG); - } - - unsigned Idx; - if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) - return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), - Idx*(NumElems/2), DAG, dl); - - // Handle VPERM2F128/VPERM2I128 permutations - if (isVPERM2X128Mask(M, VT, HasFp256)) - return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, - V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) - return getINSERTPS(SVOp, dl, DAG); - - unsigned Imm8; - if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) - return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); - - if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || - VT.is512BitVector()) { - MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); - MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); - SmallVector<SDValue, 16> permclMask; - for (unsigned i = 0; i != NumElems; ++i) { - permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); - } - - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); - if (V2IsUndef) - // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 - return DAG.getNode(X86ISD::VPERMV, dl, VT, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); - return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2); - } - - //===--------------------------------------------------------------------===// - // Since no target specific shuffle was selected for this generic one, - // lower it into other known shuffles. FIXME: this isn't true yet, but - // this is the plan. - // - - // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
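The isBlendMask path above lowers to an immediate-controlled blend, where bit i of the immediate decides which source feeds result lane i. The per-lane semantics written as scalar code (illustrative only; this is how BLENDPS and VPBLENDD interpret their imm8):

#include <array>
#include <cstddef>
#include <cstdint>

// Result lane i comes from the second source when bit i of the immediate is
// set, otherwise from the first source.
template <typename T, std::size_t N>
static std::array<T, N> blendByImmediate(const std::array<T, N> &V1,
                                         const std::array<T, N> &V2,
                                         uint8_t Imm) {
  std::array<T, N> R;
  for (std::size_t i = 0; i != N; ++i)
    R[i] = (Imm >> i) & 1 ? V2[i] : V1[i];
  return R;
}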
- if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v16i16 && Subtarget->hasInt256()) { - SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v16i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v32i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - // Handle all 128-bit wide vectors with 4 elements, and match them with - // several different shuffle types. - if (NumElems == 4 && VT.is128BitVector()) - return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); - - // Handle general 256-bit shuffles - if (VT.is256BitVector()) - return LowerVECTOR_SHUFFLE_256(SVOp, DAG); - - return SDValue(); -} - // This function assumes its argument is a BUILD_VECTOR of constants or // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is // true. @@ -12344,48 +10083,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend -/// instruction. -static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to a vector shuffle. +static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); - - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) - return SDValue(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); + auto *CondBV = cast<BuildVectorSDNode>(Cond); - // Check the mask for BLEND and build the value. - unsigned MaskValue = 0; - if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) - return SDValue(); - - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); - RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + // Only non-legal VSELECTs reach this lowering, convert those into generic + // shuffles and re-use the shuffle lowering path for blends. + SmallVector<int, 32> Mask; + for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { + SDValue CondElt = CondBV->getOperand(i); + Mask.push_back( + isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? 
Size : 0) : -1); } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); + return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -12396,28 +10116,41 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); - SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); + // Try to lower this to a blend-style vector shuffle. This can handle all + // constant condition cases. + SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; - // Some types for vselect were previously set to Expand, not Legal or - // Custom. Return an empty SDValue so we fall-through to Expand, after - // the Custom lowering phase. - MVT VT = Op.getSimpleValueType(); - switch (VT.SimpleTy) { + // Variable blends are only legal from SSE4.1 onward. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // Only some types will be legal on some subtargets. If we can emit a legal + // VSELECT-matching blend, return Op, and but if we need to expand, return + // a null value. + switch (Op.getSimpleValueType().SimpleTy) { default: - break; + // Most of the vector types have blends past SSE4.1. + return Op; + + case MVT::v32i8: + // The byte blends for AVX vectors were introduced only in AVX2. + if (Subtarget->hasAVX2()) + return Op; + + return SDValue(); + case MVT::v8i16: case MVT::v16i16: + // AVX-512 BWI and VLX features support VSELECT with i16 elements. if (Subtarget->hasBWI() && Subtarget->hasVLX()) - break; + return Op; + + // FIXME: We should custom lower this by fixing the condition and using i8 + // blends. return SDValue(); } - - // We couldn't create a "Blend with immediate" node. - // This node should still be legal, but we'll have to emit a blendv* - // instruction. - return Op; } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { @@ -12493,6 +10226,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512 @@ -12506,6 +10241,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); const TargetRegisterClass* rc = getRegClassFor(VecVT); + if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) + rc = getRegClassFor(MVT::v16i1); unsigned MaxSift = rc->getSize()*8 - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, MVT::i8)); @@ -12631,7 +10368,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. -SDValue +SDValue X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); @@ -12644,7 +10381,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { // insert element and then truncate the result. MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); - SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); @@ -12815,27 +10552,47 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - SDLoc dl(Op.getNode()); - SDValue Vec = Op.getNode()->getOperand(0); - SDValue SubVec = Op.getNode()->getOperand(1); - SDValue Idx = Op.getNode()->getOperand(2); - - if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || - Op.getNode()->getSimpleValueType(0).is512BitVector()) && - SubVec.getNode()->getSimpleValueType(0).is128BitVector() && - isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); - } + if (!Subtarget->hasAVX()) + return SDValue(); - if (Op.getNode()->getSimpleValueType(0).is512BitVector() && - SubVec.getNode()->getSimpleValueType(0).is256BitVector() && - isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // Fold two 16-byte subvector loads into one 32-byte load: + // (insert_subvector (insert_subvector undef, (load addr), 0), + // (load addr + 16), Elts/2) + // --> load32 addr + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.is256BitVector() && SubVecVT.is128BitVector() && + !Subtarget->isUnalignedMem32Slow()) { + SDValue SubVec2 = Vec.getOperand(1); + if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { + if (Idx2->getZExtValue() == 0) { + SDValue Ops[] = { SubVec2, SubVec }; + SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false); + if (LD.getNode()) + return LD; + } } } + + if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && + SubVecVT.is128BitVector()) + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + return SDValue(); } @@ -13392,7 +11149,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, } return SDValue(); } - + assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -14039,7 +11796,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; } - + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, getPointerTy()); @@ -14233,7 +11990,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } - + unsigned EltBits = EltVT.getSizeInBits(); LLVMContext *Context = DAG.getContext(); // For FABS, mask is 0x7f...; for FNEG, mask is 
0x80... @@ -14260,7 +12017,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } - + // If not vector, then scalar. unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; @@ -14290,19 +12047,17 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. - // First get the sign bit of second operand. - SmallVector<Constant*,4> CV; - if (SrcVT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); - } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - } + const fltSemantics &Sem = + VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + const unsigned SizeInBits = VT.getSizeInBits(); + + SmallVector<Constant *, 4> CV( + VT == MVT::f64 ? 2 : 4, + ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + + // First, clear all bits but the sign bit from the second operand (sign). + CV[0] = ConstantFP::get(*Context, + APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, @@ -14310,40 +12065,30 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); - // Shift sign bit right or left if the two operands have different types. - if (SrcVT.bitsGT(VT)) { - // Op0 is MVT::f32, Op1 is MVT::f64. - SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); - SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, - DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); - SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, - DAG.getIntPtrConstant(0)); - } - - // Clear first operand sign bit. - CV.clear(); - if (VT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); + // Next, clear the sign bit from the first operand (magnitude). + // If it's a constant, we can clear it here. + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + APFloat APF = Op0CN->getValueAPF(); + // If the magnitude is a positive zero, the sign bit alone is enough. 
+ if (APF.isPosZero()) + return SignBit; + APF.clearSign(); + CV[0] = ConstantFP::get(*Context, APF); } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV[0] = ConstantFP::get( + *Context, + APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); - SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); - - // Or the value with the sign bit. + SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + // If the magnitude operand wasn't a constant, we need to AND out the sign. + if (!isa<ConstantFPSDNode>(Op0)) + Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); + + // OR the magnitude value with the sign bit. return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } @@ -14473,11 +12218,11 @@ static bool hasNonFlagsUse(SDValue Op) { /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::i1) - // KORTEST instruction should be selected - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); - + if (Op.getValueType() == MVT::i1) { + SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, + DAG.getConstant(0, MVT::i8)); + } // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; @@ -14697,9 +12442,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, DAG.getConstant(0, Op.getValueType())); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0; i != NumOperands; ++i) - Ops.push_back(Op.getOperand(i)); + SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); @@ -14717,16 +12460,16 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); } - + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Do the comparison at i32 if it's smaller, besides the Atom case. - // This avoids subregister aliasing issues. Keep the smaller reference - // if we're optimizing for size, however, as that'll allow better folding + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding // of memory operations. 
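The rewritten LowerFCOPYSIGN above builds exactly two constants, the sign-bit mask and its complement, and computes (magnitude & ~signbit) | (sign & signbit). A scalar model of the same bit manipulation for f64:

#include <cstdint>
#include <cstring>

// Clear everything but the sign bit of 'Sign', clear the sign bit of 'Mag',
// then OR the two together.
static double copySignViaMasks(double Mag, double Sign) {
  uint64_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(double));
  std::memcpy(&SignBits, &Sign, sizeof(double));
  const uint64_t SignMask = 1ULL << 63;       // APInt::getHighBitsSet(64, 1)
  uint64_t Result = (MagBits & ~SignMask) | (SignBits & SignMask);
  double Out;
  std::memcpy(&Out, &Result, sizeof(double));
  return Out;
}

// copySignViaMasks(1.5, -0.0) == -1.5; when the magnitude is the constant
// +0.0 the masked sign bit alone is the answer, which is the early return
// the patch adds above.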
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::MinSize) && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -14780,7 +12523,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, return SDValue(); EVT VT = Op.getValueType(); - + // SSE1 has rsqrtss and rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision @@ -14808,9 +12551,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // significant digits in the divisor. if (!Subtarget->useReciprocalEst()) return SDValue(); - + EVT VT = Op.getValueType(); - + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision @@ -15307,8 +13050,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { cast<ConstantSDNode>(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) + if (NewSetCC.getNode()) { + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; + } } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of @@ -15629,11 +13375,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget ((Subtarget->hasDQI() && Subtarget->hasVLX() && VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || - + ((Subtarget->hasDQI() && VT.is512BitVector() && VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); - + unsigned int NumElts = VT.getVectorNumElements(); if (NumElts != 8 && NumElts != 16) @@ -15718,6 +13464,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. +// FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, @@ -15797,9 +13544,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. 
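getRsqrtEstimate above returns only the raw RSQRTSS/RSQRTPS approximation, which is accurate to roughly 12 bits; the additional precision alluded to in the surrounding comments normally comes from Newton-Raphson refinement applied around the estimate, outside this function. One refinement step, shown as scalar code and not taken from this patch:

// One Newton-Raphson step for y ~ 1/sqrt(a): y1 = y0 * (1.5 - 0.5 * a * y0^2).
// Each step roughly doubles the number of correct bits of the estimate.
static float refineRsqrt(float A, float Estimate) {
  return Estimate * (1.5f - 0.5f * A * Estimate * Estimate);
}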
MVT SclrLoadTy = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; + for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } @@ -16232,7 +13977,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || SplitStack; SDLoc dl(Op); @@ -16258,7 +14003,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -16316,8 +14061,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -16427,21 +14171,16 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!DAG.getTarget().Options.UseSoftFloat && - !(DAG.getMachineFunction() - .getFunction()->getAttributes() - .hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) && + !(DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain - SmallVector<SDValue, 11> InstOps; - InstOps.push_back(Chain); - InstOps.push_back(SrcPtr); - InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); - InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); - InstOps.push_back(DAG.getConstant(Align, MVT::i32)); + SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, MVT::i32), + DAG.getConstant(ArgMode, MVT::i8), + DAG.getConstant(Align, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, @@ -16558,7 +14297,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { - assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + MVT SVT = ShAmt.getSimpleValueType(); + assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. 
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) @@ -16573,13 +14313,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - // Need to build a vector containing shift amount - // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + // Let the shuffle legalizer expand this shift amount node. + SDValue Op0 = ShAmt.getOperand(0); + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + } else { + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + SmallVector<SDValue, 4> ShOps; + ShOps.push_back(ShAmt); + if (SVT == MVT::i32) { + ShOps.push_back(DAG.getConstant(0, SVT)); + ShOps.push_back(DAG.getUNDEF(SVT)); + } + ShOps.push_back(DAG.getUNDEF(SVT)); + + MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + } // The return type has to be a 128-bit type with the same element // type as the input type. @@ -16628,52 +14383,28 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); } -static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
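The BUILD_VECTOR constructed above reflects how the SSE/AVX packed shift instructions (PSLLD, PSRLQ and friends) consume their count: only the low 64 bits of the XMM operand are read, so a 32-bit amount needs a zero in the adjacent lane while the upper lanes may stay undefined. The layout, spelled out (illustrative):

#include <cstdint>

// 128-bit shift-count operand as the lowering above builds it for an i32
// amount. Only the low 64 bits are read by the instruction, so lanes 2 and 3
// are left undefined; 0xdeadbeef stands in for "undef" here.
struct ShiftCountV4I32 {
  uint32_t Lane[4];
};

static ShiftCountV4I32 makeShiftCount(uint32_t Amount) {
  return { { Amount, 0u, 0xdeadbeefu, 0xdeadbeefu } };
}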
- case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_mask_vfmadd_ps_512: - case Intrinsic::x86_fma_mask_vfmadd_pd_512: - return X86ISD::FMADD; - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_mask_vfmsub_ps_512: - case Intrinsic::x86_fma_mask_vfmsub_pd_512: - return X86ISD::FMSUB; - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_mask_vfnmadd_ps_512: - case Intrinsic::x86_fma_mask_vfnmadd_pd_512: - return X86ISD::FNMADD; - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_mask_vfnmsub_ps_512: - case Intrinsic::x86_fma_mask_vfnmsub_pd_512: - return X86ISD::FNMSUB; - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: - case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: - return X86ISD::FMADDSUB; - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: - case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: - return X86ISD::FMSUBADD; - } +/// \brief Creates an SDNode for a predicated scalar operation. +/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). +/// The mask is comming as MVT::i8 and it should be truncated +/// to MVT::i1 while lowering masking intrinsics. +/// The main difference between ScalarMaskingNode and VectorMaskingNode is using +/// "X86select" instead of "vselect". We just can't create the "vselect" node for +/// a scalar instruction. +static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (isAllOnes(Mask)) + return Op; + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16701,7 +14432,73 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget RoundingMode), Mask, Src0, Subtarget, DAG); } - + case INTR_TYPE_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src0 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // There are 2 kinds of intrinsics in this group: + // (1) With supress-all-exceptions (sae) - 6 operands + // (2) With rounding mode and sae - 7 operands. 
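getScalarMaskingNode added above implements AVX-512 merge masking for scalar operations: only bit 0 of the mask matters, and a disabled lane yields the pass-through value, or zero when no pass-through was provided. The semantics in scalar form (a sketch of the behaviour, not of the DAG node):

#include <cstdint>

// Masked scalar op such as vaddss with {k1} / {k1}{z}: the low mask bit
// either lets the new result through or keeps the pass-through value.
static double maskedScalarOp(double Result, double PassThru, uint8_t Mask,
                             bool ZeroMasking) {
  if (Mask & 1)
    return Result;
  return ZeroMasking ? 0.0 : PassThru;
}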
+ if (Op.getNumOperands() == 6) { + SDValue Sae = Op.getOperand(5); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + Sae), + Mask, Src0, Subtarget, DAG); + } + assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + RoundingMode, Sae), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1,Src2), + Mask, PassThru, Subtarget, DAG); + } + case FMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, Src1, Subtarget, DAG); + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src1, Src2, Src3), + Mask, Src1, Subtarget, DAG); + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. 
@@ -16751,9 +14548,45 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); case VSHIFT_MASK: - return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG), - Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);; + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_EXPAND_IN_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnes(Mask)) // return data as is + return Op.getOperand(1); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, + PassThru); + } + case BLEND: { + SDValue Mask = Op.getOperand(3); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); + } default: break; } @@ -16762,138 +14595,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - // Arithmetic intrinsics. - case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_avx2_pmulu_dq: - return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulhu_w: - case Intrinsic::x86_avx2_pmulhu_w: - return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulh_w: - case Intrinsic::x86_avx2_pmulh_w: - return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE/SSE2/AVX floating point max/min intrinsics. - case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - Opcode = X86ISD::FMAX; - break; - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: - Opcode = X86ISD::FMIN; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // AVX2 variable shift intrinsics - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - Opcode = ISD::SHL; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - Opcode = ISD::SRL; - break; - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - Opcode = ISD::SRA; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx2_packusdw: - return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshuf_d: - return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufl_w: - return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufh_w: - return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_psign_b_128: - case Intrinsic::x86_ssse3_psign_w_128: - case Intrinsic::x86_ssse3_psign_d_128: - case Intrinsic::x86_avx2_psign_b: - case Intrinsic::x86_avx2_psign_w: - case Intrinsic::x86_avx2_psign_d: - return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - // Operands intentionally swapped. Mask is last operand to intrinsic, - // but second operand for node/instruction. - return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(1)); - case Intrinsic::x86_avx512_mask_valign_q_512: case Intrinsic::x86_avx512_mask_valign_d_512: // Vector source operands are swapped. 
@@ -17056,58 +14757,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } - - case Intrinsic::x86_fma_mask_vfmadd_ps_512: - case Intrinsic::x86_fma_mask_vfmadd_pd_512: - case Intrinsic::x86_fma_mask_vfmsub_ps_512: - case Intrinsic::x86_fma_mask_vfmsub_pd_512: - case Intrinsic::x86_fma_mask_vfnmadd_ps_512: - case Intrinsic::x86_fma_mask_vfnmadd_pd_512: - case Intrinsic::x86_fma_mask_vfnmsub_ps_512: - case Intrinsic::x86_fma_mask_vfnmsub_pd_512: - case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: - case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: - case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: - case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { - auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); - if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) - return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), - dl, Op.getValueType(), - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)), - Op.getOperand(4), Op.getOperand(1), - Subtarget, DAG); - else - return SDValue(); - } - - case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -17305,7 +14954,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); - break; + break; case RDSEED: case RDRAND: { // Emit the node with the right value type. 
@@ -17403,6 +15052,58 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Results.push_back(Store); return DAG.getMergeValues(Results, dl); } + case COMPRESS_TO_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + if (isAllOnes(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, 0); + + EVT VT = DataToCompress.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, + DataToCompress, DAG.getUNDEF(VT)); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, 0); + } + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PathThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + EVT VT = Op.getValueType(); + + if (isAllOnes(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, 0); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, 0); + + SDValue Results[] = { + DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru), + Chain}; + return DAG.getMergeValues(Results, dl); + } } } @@ -17420,8 +15121,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -17436,15 +15136,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + EVT VT = Op.getValueType(); + MFI->setFrameAddressIsTaken(true); - EVT VT = Op.getValueType(); + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + // Depth > 0 makes no sense on targets which use Windows unwind codes. It + // is not possible to crawl up the stack without looking at the unwind codes + // simultaneously. + int FrameAddrIndex = FuncInfo->getFAIndex(); + if (!FrameAddrIndex) { + // Set up a frame object for the return address. 
+ unsigned SlotSize = RegInfo->getSlotSize(); + FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( + SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false); + FuncInfo->setFAIndex(FrameAddrIndex); + } + return DAG.getFrameIndex(FrameAddrIndex, VT); + } + + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -17471,8 +15189,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -17483,8 +15200,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -17531,7 +15247,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -17694,8 +15410,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, */ MachineFunction &MF = DAG.getMachineFunction(); - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -18090,76 +15805,29 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG); } - if (VT == MVT::v16i8) { - if (Op.getOpcode() == ISD::SHL) { - // Make a large shift. - SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, - MVT::v8i16, R, ShiftAmt, - DAG); - SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); - // Zero out the rightmost bits. - SmallVector<SDValue, 16> V(16, - DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); - } - if (Op.getOpcode() == ISD::SRL) { - // Make a large shift. - SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, - MVT::v8i16, R, ShiftAmt, - DAG); - SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); - // Zero out the leftmost bits. 
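Both the removed per-type blocks and the unified code that follows them lean on the same byte-shift tricks: x86 has no byte-granular vector shift, so the value is shifted at i16 granularity and the bits that crossed a byte boundary are masked off, while an arithmetic right shift is rebuilt from a logical one via ((x u>> a) ^ m) - m with m = 0x80 >> a. A standalone scalar sketch of those identities on one byte lane (not DAG code):

#include <cassert>
#include <cstdint>

// One byte lane of the vector lowering: shift, then mask the stray bits.
static uint8_t shl8(uint8_t X, unsigned A) {
  return (uint8_t)((X << A) & (0xFFu << A));   // zero the rightmost bits
}
static uint8_t srl8(uint8_t X, unsigned A) {
  return (uint8_t)((X >> A) & (0xFFu >> A));   // zero the leftmost bits
}
// Arithmetic shift right from a logical one: ((x u>> a) ^ m) - m.
static uint8_t sra8(uint8_t X, unsigned A) {
  uint8_t M = (uint8_t)(0x80u >> A);           // where the sign bit lands
  return (uint8_t)(((X >> A) ^ M) - M);
}

int main() {
  assert(shl8(0xFF, 3) == 0xF8);
  assert(srl8(0xFF, 3) == 0x1F);
  assert(sra8(0xF0, 4) == 0xFF);  // -16 s>> 4 == -1
  assert(sra8(0x70, 4) == 0x07);  // 112 s>> 4 == 7
  return 0;
}

In this scalar form the AND masks are redundant because uint8_t already truncates; in the vector form they are exactly what keeps bits of the neighbouring byte from leaking through the i16-wide shift.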
- SmallVector<SDValue, 16> V(16, - DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); - } - if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } - - // R s>> a === ((R u>> a) ^ m) - m - SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); - Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); - Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); - return Res; - } - llvm_unreachable("Unknown shift opcode."); - } + if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + unsigned NumElts = VT.getVectorNumElements(); + MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, - MVT::v16i16, R, ShiftAmt, - DAG); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, + R, ShiftAmt, DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V(32, - DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); + SmallVector<SDValue, 32> V( + NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, - MVT::v16i16, R, ShiftAmt, - DAG); + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, + R, ShiftAmt, DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V(32, - DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); + SmallVector<SDValue, 32> V( + NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } @@ -18172,8 +15840,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // R s>> a === ((R u>> a) ^ m) - m SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, - MVT::i8)); + SmallVector<SDValue, 32> V(NumElts, + DAG.getConstant(128 >> ShiftAmt, MVT::i8)); SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); @@ -18249,55 +15917,43 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); - if (Amt.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned i, j; - for (i = 0; i != NumElts; ++i) { - if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) - continue; - break; - } - for (j = i; j != NumElts; ++j) { - SDValue Arg = Amt.getOperand(j); - if (Arg.getOpcode() == ISD::UNDEF) continue; - if (Arg != Amt.getOperand(i)) - break; - } - if (i != NumElts && j == NumElts) - BaseShAmt = Amt.getOperand(i); + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { + // Check if this build_vector node is doing a splat. + // If so, then set BaseShAmt equal to the splat value. 
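The hand-rolled scans in the removed code are replaced by BuildVectorSDNode::getSplatValue() just below: a build_vector supplies a single scalar shift amount when every defined lane agrees, and undef lanes do not disqualify the splat. A conceptual standalone model of that check, with std::optional standing in for undef (this is not the SelectionDAG API):

#include <optional>
#include <vector>

// Returns the common value of all defined lanes, or nullopt if two defined
// lanes disagree. An all-undef vector also comes back empty, which the
// caller treats as "no usable splat".
static std::optional<int>
findSplat(const std::vector<std::optional<int>> &Lanes) {
  std::optional<int> Splat;
  for (const auto &L : Lanes) {
    if (!L)
      continue;               // undef lane: ignore
    if (!Splat)
      Splat = L;              // first defined lane fixes the candidate
    else if (*Splat != *L)
      return std::nullopt;    // mismatch: not a splat
  }
  return Splat;
}

int main() {
  // <4, undef, 4, 4> splats 4, so one scalar amount drives the whole shift.
  std::vector<std::optional<int>> Amt = {4, std::nullopt, 4, 4};
  return findSplat(Amt) == 4 ? 0 : 1;
}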
+ BaseShAmt = BV->getSplatValue(); + if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); - if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && - cast<ShuffleVectorSDNode>(Amt)->isSplat()) { + + ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); + if (SVN && SVN->isSplat()) { + unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = InVec.getValueType().getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = InVec.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } + assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + "Unexpected shuffle index found!"); + BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { - unsigned SplatIdx = - cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } - if (!BaseShAmt.getNode()) - BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, - DAG.getIntPtrConstant(0)); + + if (!BaseShAmt) + // Avoid introducing an extract element from a shuffle. + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, + DAG.getIntPtrConstant(SplatIdx)); } } if (BaseShAmt.getNode()) { - if (EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); + if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); @@ -18415,7 +16071,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. - if (Op.getOpcode() == ISD::SHL && + if (Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { @@ -18507,15 +16163,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, CanBeSimplified = Amt2 == Amt->getOperand(j); } } - + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. EVT CastVT = MVT::v4i32; - SDValue Splat1 = + SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); - SDValue Splat2 = + SDValue Splat2 = DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) @@ -18704,81 +16360,17 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -// Sign extension of the low part of vector elements. This may be used either -// when sign extend instructions are not available or if the vector element -// sizes already match the sign-extended size. 
If the vector elements are in -// their pre-extended size and sign extend instructions are available, that will -// be handled by LowerSIGN_EXTEND. -SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - MVT VT = Op.getSimpleValueType(); - - if (!Subtarget->hasSSE2() || !VT.isVector()) - return SDValue(); - - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v8i32: - case MVT::v16i16: - if (!Subtarget->hasFp256()) - return SDValue(); - if (!Subtarget->hasInt256()) { - // needs to be split - unsigned NumElems = VT.getVectorNumElements(); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); - ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, - ExtraNumElems/2); - SDValue Extra = DAG.getValueType(ExtraVT); - - LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); - LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); - } - // fall through - case MVT::v4i32: - case MVT::v8i16: { - SDValue Op0 = Op.getOperand(0); - - // This is a sign extension of some low part of vector elements without - // changing the size of the vector elements themselves: - // Shift-Left + Shift-Right-Algebraic. - SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, - BitsDiff, DAG); - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, - DAG); - } - } -} - /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); + return Subtarget->hasCmpxchg16b(); else return false; } @@ -18795,9 +16387,7 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { } bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available @@ -18840,9 +16430,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually @@ -18878,7 +16466,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; - } else if (hasMFENCE(Subtarget)) { + } else if (hasMFENCE(*Subtarget)) { Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence); @@ -18997,9 +16585,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, DAG.getIntPtrConstant(i))); // Explicitly mark the extra elements as Undef. - SDValue Undef = DAG.getUNDEF(SVT); - for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) - Elts.push_back(Undef); + Elts.append(NumElts, DAG.getUNDEF(SVT)); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); @@ -19025,6 +16611,139 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + + Op = Op.getOperand(0); + EVT VT = Op.getValueType(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "CTPOP lowering only implemented for 128/256-bit wide vector types"); + + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + unsigned Len = EltVT.getSizeInBits(); + + // This is the vectorized version of the "best" algorithm from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // with a minor tweak to use a series of adds + shifts instead of vector + // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: + // + // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled + // v8i32 => Always profitable + // + // FIXME: There a couple of possible improvements: + // + // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). + // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html + // + assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && + "CTPOP not implemented for this vector element type."); + + // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid + // extra legalization. + bool NeedsBitcast = EltVT == MVT::i32; + MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; + + SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); + SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); + SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); + + // v = v - ((v >> 1) & 0x55555555...) 
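The steps listed in the comment above are the classic parallel bit count applied independently to every lane; only the final multiply of the textbook version is replaced by adds and shifts. For reference, a scalar 32-bit equivalent of what the DAG code below constructs (standalone sketch, same add/shift tail):

#include <cassert>
#include <cstdint>

// Scalar model of the vectorized CTPOP lowering for one i32 lane.
static uint32_t popcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit partial sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
  // Textbook version: (V * 0x01010101) >> 24. The lowering folds the byte
  // sums together with adds and shifts instead, then masks the low bits.
  V = V + (V >> 8);
  V = V + (V >> 16);
  return V & 0x3Fu;                                 // result fits in 6 bits
}

int main() {
  assert(popcount32(0x00000000u) == 0);
  assert(popcount32(0xFFFFFFFFu) == 32);
  assert(popcount32(0x80000001u) == 2);
  return 0;
}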
+ SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT)); + SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); + SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); + if (NeedsBitcast) + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + + SmallVector<SDValue, 8> Mask55(NumElts, Cst55); + SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); + if (NeedsBitcast) + M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); + + SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); + + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) + SmallVector<SDValue, 8> Mask33(NumElts, Cst33); + SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); + SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT)); + SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); + + Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); + if (NeedsBitcast) { + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); + Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); + } + + SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); + SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); + if (VT != AndRHS.getValueType()) { + AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); + AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); + } + SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT)); + SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + + SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); + } + And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + // The algorithm mentioned above uses: + // v = (v * 0x01010101...) >> (Len - 8) + // + // Change it to use vector adds + vector shifts which yield faster results on + // Haswell than using vector integer multiplication. + // + // For i32 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // + // For i64 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // v = v + (v >> 32) + // + Add = And; + SmallVector<SDValue, 8> Csts; + for (unsigned i = 8; i <= Len/2; i *= 2) { + Csts.assign(NumElts, DAG.getConstant(i, EltVT)); + SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + Csts.clear(); + } + + // The result is on the least significant 6-bits on i32 and 7-bits on i64. + SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 
0x3F : 0x7F), EltVT); + SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F); + SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); + } + And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + return And; +} + static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); @@ -19148,15 +16867,15 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); - case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); @@ -19243,6 +16962,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
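Widening is the standard way out for the illegal v2f32 result type handled below: perform the operation on a legal v4f32 whose upper lanes are don't-care, then only consume the low two lanes. A tiny standalone model of that idea (std::min stands in for the x86 FMIN node and ignores its NaN/ordering semantics):

#include <algorithm>
#include <array>
#include <cassert>

int main() {
  std::array<float, 2> A = {1.0f, 8.0f}, B = {4.0f, 2.0f};
  // Concatenate with "undef" upper lanes; zero is only a stand-in value here.
  std::array<float, 4> WideA = {A[0], A[1], 0.0f, 0.0f};
  std::array<float, 4> WideB = {B[0], B[1], 0.0f, 0.0f};
  std::array<float, 4> WideR;
  for (int i = 0; i < 4; ++i)
    WideR[i] = std::min(WideA[i], WideB[i]);   // the widened 4-lane operation
  // Only the low v2f32 part of the result is ever read back.
  assert(WideR[0] == 1.0f && WideR[1] == 2.0f);
  return 0;
}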
+ case X86ISD::FMINC: + case X86ISD::FMIN: + case X86ISD::FMAXC: + case X86ISD::FMAX: { + EVT VT = N->getValueType(0); + if (VT != MVT::v2f32) + llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); + SDValue UNDEF = DAG.getUNDEF(VT); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); + return; + } case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -19599,6 +17334,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; + case X86ISD::SELECT: return "X86ISD::SELECT"; + case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; + case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; } } @@ -19747,6 +17492,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } + bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) @@ -19783,68 +17530,20 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (!VT.isSimple()) return false; - MVT SVT = VT.getSimpleVT(); - // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) return false; - // If this is a single-input shuffle with no 128 bit lane crossings we can - // lower it into pshufb. - if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || - (SVT.is256BitVector() && Subtarget->hasInt256())) { - bool isLegal = true; - for (unsigned I = 0, E = M.size(); I != E; ++I) { - if (M[I] >= (int)SVT.getVectorNumElements() || - ShuffleCrosses128bitLane(SVT, I, M[I])) { - isLegal = false; - break; - } - } - if (isLegal) - return true; - } - - // FIXME: blends, shifts. - return (SVT.getVectorNumElements() == 2 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isMOVLMask(M, SVT) || - isMOVHLPSMask(M, SVT) || - isSHUFPMask(M, SVT) || - isSHUFPMask(M, SVT, /* Commuted */ true) || - isPSHUFDMask(M, SVT) || - isPSHUFDMask(M, SVT, /* SecondOperand */ true) || - isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || - isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || - isPALIGNRMask(M, SVT, Subtarget) || - isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || - isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || - isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || - (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. 
+ return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const { - if (!VT.isSimple()) - return false; - - MVT SVT = VT.getSimpleVT(); - unsigned NumElts = SVT.getVectorNumElements(); - // FIXME: This collection of masks seems suspect. - if (NumElts == 2) - return true; - if (NumElts == 4 && SVT.is128BitVector()) { - return (isMOVLMask(Mask, SVT) || - isCommutedMOVLMask(Mask, SVT, true) || - isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true) || - isBlendMask(Mask, SVT, Subtarget->hasSSE41(), - Subtarget->hasInt256())); - } - return false; + // Just delegate to the generic legality, clear masks aren't special. + return isShuffleMaskLegal(Mask, VT); } //===----------------------------------------------------------------------===// @@ -19982,11 +17681,10 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, return BB; } -static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII, - const X86Subtarget* Subtarget) { +static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); - + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; @@ -20008,9 +17706,8 @@ static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, } MachineBasicBlock * -X86TargetLowering::EmitVAARG64WithCustomInserter( - MachineInstr *MI, - MachineBasicBlock *MBB) const { +X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: @@ -20040,7 +17737,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -20192,7 +17889,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB - BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } @@ -20296,7 +17993,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -20306,7 +18003,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget->isTargetWin64()) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } @@ -20379,7 +18076,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -20405,8 +18102,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo *TRI = - BB->getParent()->getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -20448,7 +18144,7 @@ MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -20510,7 +18206,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. @@ -20518,13 +18214,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); - BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. - const uint32_t *RegMask = MF->getTarget() - .getSubtargetImpl() - ->getRegisterInfo() - ->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -20557,7 +18251,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); - BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); @@ -20581,52 +18275,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetMacho()); - - // The lowering is pretty easy: we're just emitting the call to _alloca. The - // non-trivial part is impdef of ESP. 
- - if (Subtarget->isTargetWin64()) { - if (Subtarget->isTargetCygMing()) { - // ___chkstk(Mingw64): - // Clobbers R10, R11, RAX and EFLAGS. - // Updates RSP. - BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) - .addExternalSymbol("___chkstk") - .addReg(X86::RAX, RegState::Implicit) - .addReg(X86::RSP, RegState::Implicit) - .addReg(X86::RAX, RegState::Define | RegState::Implicit) - .addReg(X86::RSP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } else { - // __chkstk(MSVCRT): does not update stack pointer. - // Clobbers R10, R11 and EFLAGS. - BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) - .addExternalSymbol("__chkstk") - .addReg(X86::RAX, RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to be subtracted from RSP. - BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); - } - } else { - const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || - Subtarget->isTargetWindowsItanium()) - ? "_chkstk" - : "_alloca"; + assert(!Subtarget->isTargetMachO()); - BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(X86::EAX, RegState::Implicit) - .addReg(X86::ESP, RegState::Implicit) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::ESP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } + X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -20640,8 +18293,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); @@ -20650,10 +18302,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. 
- const uint32_t *RegMask = F->getTarget() - .getSubtargetImpl() - ->getRegisterInfo() - ->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -20698,7 +18348,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -20739,6 +18389,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // v = phi(main, restore) // // restoreMBB: + // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; @@ -20804,8 +18455,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - MF->getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -20822,8 +18472,20 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: + if (RegInfo->hasBasePointer(*MF)) { + const bool Uses64BitFramePtr = + Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + X86FI->setRestoreBasePointer(MF); + unsigned FramePtr = RegInfo->getFrameRegister(*MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); + } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); - BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI->eraseFromParent(); @@ -20835,7 +18497,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -20850,8 +18512,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - MF->getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -20895,7 +18556,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. 
Writing back to the accumulator allows the coalescer -// to remove extra copies in the loop. +// to remove extra copies in the loop. MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -20970,7 +18631,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, default: llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) @@ -20993,6 +18654,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: + case X86::TAILJMPd64_REX: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: @@ -21035,7 +18699,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" @@ -21119,7 +18783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -21132,16 +18796,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), - Subtarget); + return EmitMonitor(MI, BB, Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -21157,6 +18820,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. + return emitPatchPoint(MI, BB); + case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); @@ -22118,9 +19786,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. 
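The three masks tested below encode exactly the ADDSUB lane pattern: even result lanes come from the first shuffle operand and odd lanes from the second, with an index of NumElts or more selecting the second vector. A standalone 4-lane illustration, assuming the FSUB feeds the even lanes and the FADD the odd ones, which is the arrangement these masks describe:

#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> Sub, Add, Blend, AddSub;
  const int Mask[4] = {0, 5, 2, 7};           // index >= 4 picks from Add
  for (int i = 0; i < 4; ++i) {
    Sub[i] = A[i] - B[i];                     // the FSUB node
    Add[i] = A[i] + B[i];                     // the FADD node
  }
  for (int i = 0; i < 4; ++i)
    Blend[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
  for (int i = 0; i < 4; ++i)
    AddSub[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  assert(Blend == AddSub);        // shuffle(FSUB, FADD, mask) == one ADDSUB node
  return 0;
}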
- if (!(isShuffleEquivalent(Mask, 0, 3) || - isShuffleEquivalent(Mask, 0, 5, 2, 7) || - isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) + if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); // Only specific types are legal at this point, assert so we notice if and @@ -22176,7 +19844,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); - + if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { @@ -22304,7 +19972,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, : InVec.getOperand(1); // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + unsigned AllowedUses = InVec.getNumOperands() > 1 && + InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. @@ -22349,9 +20018,30 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are +/// special and don't usually play with other vector types, it's better to +/// handle them early to be sure we emit efficient code by avoiding +/// store-load conversions. +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::x86mmx || + N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR || + N->getOperand(0)->getValueType(0) != MVT::v2i32) + return SDValue(); + + SDValue V = N->getOperand(0); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)), + N->getValueType(0), V.getOperand(0)); + + return SDValue(); +} + /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts -/// to a simple store and scalar loads to extract the elements. +/// into a somewhat faster sequence. For i686, the best sequence is apparently +/// storing the value and loading scalars back, while for x64 we should +/// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); @@ -22360,14 +20050,29 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, SDValue InputVector = N->getOperand(0); - // Detect whether we are trying to convert from mmx to i32 and the bitcast - // from mmx to v2i32 has a single usage. - if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && - InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && - InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), - InputVector.getNode()->getOperand(0)); + // Detect mmx to i32 conversion through a v2i32 elt extract. 
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + N->getValueType(0) == MVT::i32 && + InputVector.getValueType() == MVT::v2i32) { + + // The bitcast source is a direct mmx result. + SDValue MMXSrc = InputVector.getNode()->getOperand(0); + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); + + // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && + MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() && + MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + MMXSrcOp.getOperand(0)); + } // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. @@ -22410,36 +20115,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Ok, we've now decided to do the transformation. + // If 64-bit shifts are legal, use the extract-shift sequence, + // otherwise bounce the vector off the cache. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Vals[4]; SDLoc dl(InputVector); - // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, - MachinePointerInfo(), false, false, 0); + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { + SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(0, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(1, VecIdxTy)); + + SDValue ShAmt = DAG.getConstant(32, + DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); + Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); + Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); + } else { + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); - // Replace each use (extract) with a load of the appropriate element. - for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), - UE = Uses.end(); UI != UE; ++UI) { - SDNode *Extract = *UI; + EVT ElementType = InputVector.getValueType().getVectorElementType(); + unsigned EltSize = ElementType.getSizeInBits() / 8; - // cOMpute the element's address. - SDValue Idx = Extract->getOperand(1); - unsigned EltSize = - InputVector.getValueType().getVectorElementType().getSizeInBits()/8; - uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + // Replace each use (extract) with a load of the appropriate element. 
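When 64-bit shifts are legal, the path above rebuilds all four i32 lanes from only two i64 extracts: the low half of each i64 is one lane and the high half, shifted down by 32, is the next; otherwise the code falls back to bouncing the vector through a stack slot as shown below. A standalone scalar model of the 64-bit unpacking (little-endian lane order, as on x86):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Lanes[4] = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
  // v4i32 viewed as v2i64: lane 0 sits in the low 32 bits of the low half.
  uint64_t Bottom = ((uint64_t)Lanes[1] << 32) | Lanes[0];
  uint64_t Top    = ((uint64_t)Lanes[3] << 32) | Lanes[2];

  uint32_t Vals[4];
  Vals[0] = (uint32_t)Bottom;          // truncate
  Vals[1] = (uint32_t)(Bottom >> 32);  // truncate(shift); the DAG uses an
                                       // arithmetic shift, but the truncate
                                       // discards any sign-extended bits
  Vals[2] = (uint32_t)Top;
  Vals[3] = (uint32_t)(Top >> 32);

  for (int i = 0; i < 4; ++i)
    assert(Vals[i] == Lanes[i]);       // each extract is now a cheap scalar op
  return 0;
}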
+ for (unsigned i = 0; i < 4; ++i) { + uint64_t Offset = EltSize * i; + SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), + StackPtr, OffsetVal); + + // Load the scalar. + Vals[i] = DAG.getLoad(ElementType, dl, Ch, + ScalarAddr, MachinePointerInfo(), + false, false, false, 0); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + } + } - // Load the scalar. - SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, - ScalarAddr, MachinePointerInfo(), - false, false, false, 0); + // Replace the extracts + for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; - // Replace the exact with the load. - DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); + SDValue Idx = Extract->getOperand(1); + uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. @@ -22456,6 +20186,21 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, bool NeedSplit = false; switch (VT.getSimpleVT().SimpleTy) { default: return std::make_pair(0, false); + case MVT::v4i64: + case MVT::v2i64: + if (!Subtarget->hasVLX()) + return std::make_pair(0, false); + break; + case MVT::v64i8: + case MVT::v32i16: + if (!Subtarget->hasBWI()) + return std::make_pair(0, false); + break; + case MVT::v16i32: + case MVT::v8i64: + if (!Subtarget->hasAVX512()) + return std::make_pair(0, false); + break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: @@ -22522,7 +20267,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, } static SDValue -TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, +transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue Cond = N->getOperand(0); @@ -22535,18 +20280,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, Cond = CondSrc->getOperand(0); } - MVT VT = N->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) - return SDValue(); - if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); @@ -22560,6 +20293,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) return SDValue(); + MVT VT = N->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> ShuffleMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { // Be sure we emit undef where we can. 
@@ -22569,6 +20304,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) + return SDValue(); return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); } @@ -22589,8 +20327,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). + // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && TLI.isTypeLegal(VT) && + VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -23008,96 +20747,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to fold this VSELECT into a MOVSS/MOVSD - if (N->getOpcode() == ISD::VSELECT && - Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) { - if (VT == MVT::v4i32 || VT == MVT::v4f32 || - (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) { - bool CanFold = false; - unsigned NumElems = Cond.getNumOperands(); - SDValue A = LHS; - SDValue B = RHS; - - if (isZero(Cond.getOperand(0))) { - CanFold = true; - - // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B) - // fold (vselect <0,-1> -> (movsd A, B) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isAllOnes(Cond.getOperand(i)); - } else if (isAllOnes(Cond.getOperand(0))) { - CanFold = true; - std::swap(A, B); - - // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A) - // fold (vselect <-1,0> -> (movsd B, A) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isZero(Cond.getOperand(i)); - } - - if (CanFold) { - if (VT == MVT::v4i32 || VT == MVT::v4f32) - return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG); - return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG); - } - - if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) { - // fold (v4i32: vselect <0,0,-1,-1>, A, B) -> - // (v4i32 (bitcast (movsd (v2i64 (bitcast A)), - // (v2i64 (bitcast B))))) - // - // fold (v4f32: vselect <0,0,-1,-1>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast A)), - // (v2f64 (bitcast B))))) - // - // fold (v4i32: vselect <-1,-1,0,0>, A, B) -> - // (v4i32 (bitcast (movsd (v2i64 (bitcast B)), - // (v2i64 (bitcast A))))) - // - // fold (v4f32: vselect <-1,-1,0,0>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast B)), - // (v2f64 (bitcast A))))) - - CanFold = (isZero(Cond.getOperand(0)) && - isZero(Cond.getOperand(1)) && - isAllOnes(Cond.getOperand(2)) && - isAllOnes(Cond.getOperand(3))); - - if (!CanFold && isAllOnes(Cond.getOperand(0)) && - isAllOnes(Cond.getOperand(1)) && - isZero(Cond.getOperand(2)) && - isZero(Cond.getOperand(3))) { - CanFold = true; - std::swap(LHS, RHS); - } - - if (CanFold) { - EVT NVT = (VT == MVT::v4i32) ? 
MVT::v2i64 : MVT::v2f64; - SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS); - SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS); - SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA, - NewB, DAG); - return DAG.getNode(ISD::BITCAST, DL, VT, Select); - } - } - } + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; } - // If we know that this node is legal then we know that it is going to be - // matched by one of the SSE/AVX BLEND instructions. These instructions only - // depend on the highest bit in each word. Try to use SimplifyDemandedBits - // to simplify previous instructions. + // If this is a *dynamic* select (non-constant condition) and we can match + // this node with one of the variable blend instructions, restructure the + // condition so that the blends can use the high bit of each element and use + // SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && - // We explicitly check against v8i16 and v16i16 because, although - // they're marked as Custom, they might only be legal when Cond is a - // build_vector of constants. This will be taken care in a later - // condition. - (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && - VT != MVT::v8i16) && - // Don't optimize vector of constants. Those are handled by - // the generic code and all the bits must be properly set for - // the generic optimizer. !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); @@ -23105,6 +20779,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (BitWidth == 1) return SDValue(); + // We can only handle the cases where VSELECT is directly legal on the + // subtarget. We custom lower VSELECT nodes with constant conditions and + // this makes it hard to see whether a dynamic VSELECT will correctly + // lower, so we both check the operation's status and explicitly handle the + // cases where a *dynamic* blend will fail even though a constant-condition + // blend could be custom lowered. + // FIXME: We should find a better way to handle this class of problems. + // Potentially, we should combine constant-condition vselect nodes + // pre-legalization into shuffles and not mark as many types as custom + // lowered. + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + // FIXME: We don't support i16-element blends currently. We could and + // should support them by making *all* the bits in the condition be set + // rather than just the high bit and using an i8-element blend. 
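For reference, the variable blend instructions this code targets (PBLENDVB and friends) consult only the most significant bit of each condition element, which is what makes it safe to demand only the high bit via SimplifyDemandedBits below. A scalar model of the byte blend, assuming the usual SSE4.1 formulation where a set sign bit selects the second source:

#include <cstdint>
#include <cstdio>

// Per-byte variable blend: the sign bit of each mask byte picks the input;
// all lower mask bits are ignored.
static void blendBytes(const uint8_t *A, const uint8_t *B, const uint8_t *Mask,
                       uint8_t *Out, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    Out[i] = (Mask[i] & 0x80) ? B[i] : A[i];
}

int main() {
  uint8_t A[4] = {1, 2, 3, 4}, B[4] = {9, 9, 9, 9};
  uint8_t M[4] = {0x80, 0x00, 0xFF, 0x7F};  // only bit 7 of each byte matters
  uint8_t O[4];
  blendBytes(A, B, M, O, 4);
  std::printf("%u %u %u %u\n", O[0], O[1], O[2], O[3]);  // 9 2 9 4
  return 0;
}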
+ if (VT.getScalarType() == MVT::i16) + return SDValue(); + // Dynamic blending was only available from SSE4.1 onward. + if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41()) + return SDValue(); + // Byte blends are only available in AVX2 + if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 && + !Subtarget->hasAVX2()) + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -23153,25 +20852,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // We should generate an X86ISD::BLENDI from a vselect if its argument - // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of - // constants. This specific pattern gets generated when we split a - // selector for a 512 bit vector in a machine without AVX512 (but with - // 256-bit vectors), during legalization: - // - // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) - // - // Iff we find this pattern and the build_vectors are built from - // constants, we translate the vselect into a shuffle_vector that we - // know will be matched by LowerVECTOR_SHUFFLEtoBlend. - if ((N->getOpcode() == ISD::VSELECT || - N->getOpcode() == X86ISD::SHRUNKBLEND) && - !DCI.isBeforeLegalize()) { - SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); - if (Shuffle.getNode()) - return Shuffle; - } - return SDValue(); } @@ -23524,7 +21204,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, // fold (blend A, B, allOnes) -> B if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Op1; - + // Simplify the case where the mask is a constant i32 value. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { if (C->isNullValue()) @@ -23590,7 +21270,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = N->getValueType(0); - if (VT != MVT::i64) + if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); @@ -23948,24 +21628,118 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, } } +static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // A vector zext_in_reg may be represented as a shuffle, + // feeding into a bitcast (this represents anyext) feeding into + // an and with a mask. + // We'd like to try to combine that into a shuffle with zero + // plus a bitcast, removing the and. + if (N0.getOpcode() != ISD::BITCAST || + N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + // The other side of the AND should be a splat of 2^C, where C + // is the number of bits in the source type. 
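A standalone model of the shuffle-with-zero form that VectorZextCombine produces (plain C++, not SelectionDAG): zero-extending by a factor of ZextRatio is an interleave with a zero vector, taking source element i / ZextRatio for the payload lanes and a zero-vector element everywhere else, which matches the mask built further down.

#include <cstdint>
#include <vector>
#include <cassert>

// Widen 8-bit lanes by ZextRatio by shuffling the source with a zero vector.
static std::vector<uint8_t> zextViaShuffle(const std::vector<uint8_t> &Src,
                                           unsigned ZextRatio) {
  unsigned NumElems = Src.size();
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElems; ++i)
    Mask.push_back(i % ZextRatio ? int(NumElems)        // zero-vector element
                                 : int(i / ZextRatio)); // payload element
  std::vector<uint8_t> Out(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    Out[i] = Mask[i] < int(NumElems) ? Src[Mask[i]] : 0;
  return Out;
}

int main() {
  // v2i8 {0x11, 0x22} zero-extended to v2i32, viewed as 8 bytes (little endian).
  std::vector<uint8_t> Out = zextViaShuffle({0x11, 0x22, 0, 0, 0, 0, 0, 0}, 4);
  assert(Out[0] == 0x11 && Out[4] == 0x22);
  for (unsigned i : {1u, 2u, 3u, 5u, 6u, 7u})
    assert(Out[i] == 0);
  return 0;
}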
+ if (N1.getOpcode() == ISD::BITCAST) + N1 = N1.getOperand(0); + if (N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); + + ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); + EVT SrcType = Shuffle->getValueType(0); + + // We expect a single-source shuffle + if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) + return SDValue(); + + unsigned SrcSize = SrcType.getScalarSizeInBits(); + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!Vector->isConstantSplat(SplatValue, SplatUndef, + SplatBitSize, HasAnyUndefs)) + return SDValue(); + + unsigned ResSize = N1.getValueType().getScalarSizeInBits(); + // Make sure the splat matches the mask we expect + if (SplatBitSize > ResSize || + (SplatValue + 1).exactLogBase2() != (int)SrcSize) + return SDValue(); + + // Make sure the input and output size make sense + if (SrcSize >= ResSize || ResSize % SrcSize) + return SDValue(); + + // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> + // The number of u's between each two values depends on the ratio between + // the source and dest type. + unsigned ZextRatio = ResSize / SrcSize; + bool IsZext = true; + for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { + if (i % ZextRatio) { + if (Shuffle->getMaskElt(i) > 0) { + // Expected undef + IsZext = false; + break; + } + } else { + if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { + // Expected element number + IsZext = false; + break; + } + } + } + + if (!IsZext) + return SDValue(); + + // Ok, perform the transformation - replace the shuffle with + // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero + // (instead of undef) where the k elements come from the zero vector. + SmallVector<int, 8> Mask; + unsigned NumElems = SrcType.getVectorNumElements(); + for (unsigned i = 0; i < NumElems; ++i) + if (i % ZextRatio) + Mask.push_back(NumElems); + else + Mask.push_back(i / ZextRatio); + + SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, + Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask); + return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget); + if (Zext.getNode()) + return Zext; + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); if (R.getNode()) return R; + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - // Check for BEXTR. 
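A scalar sketch of the BEXTR pattern matched just below: (X >> Shift) & Mask, with Mask a run of contiguous low ones, becomes one bit-field extract whose control operand packs the start bit in bits [7:0] and the field length in bits [15:8], i.e. the Shift | (MaskSize << 8) constant the code builds.

#include <cstdint>
#include <cassert>

// True if M looks like 0...011...1, a mask of contiguous low ones (isMask_64).
static bool isLowBitMask(uint64_t M) { return M != 0 && (M & (M + 1)) == 0; }

// Scalar model of BEXTR: Control[7:0] = start bit, Control[15:8] = length.
static uint64_t bextr(uint64_t X, uint16_t Control) {
  unsigned Start = Control & 0xff;
  unsigned Len = (Control >> 8) & 0xff;
  if (Start >= 64 || Len == 0)
    return 0;
  X >>= Start;
  return Len >= 64 ? X : X & ((1ULL << Len) - 1);
}

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t Mask = 0xfff;              // twelve contiguous ones
  unsigned Shift = 8;
  assert(isLowBitMask(Mask));
  unsigned MaskSize = 12;             // countPopulation(Mask)
  uint16_t Control = Shift | (MaskSize << 8);
  assert(bextr(X, Control) == ((X >> Shift) & Mask));
  return 0;
}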
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { @@ -23975,7 +21749,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, uint64_t Mask = MaskNode->getZExtValue(); uint64_t Shift = ShiftNode->getZExtValue(); if (isMask_64(Mask)) { - uint64_t MaskSize = CountPopulation_64(Mask); + uint64_t MaskSize = countPopulation(Mask); if (Shift + MaskSize <= VT.getSizeInBits()) return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), DAG.getConstant(Shift | (MaskSize << 8), VT)); @@ -23993,10 +21767,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (VT != MVT::v2i64 && VT != MVT::v4i64) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) @@ -24108,8 +21878,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + bool OptForSize = + MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -24233,11 +22003,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // On Sandybridge unaligned 256bit loads are inefficient. + // For chips with slow 32-byte unaligned loads, break the 32-byte operation + // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) @@ -24270,6 +22041,166 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// PerformMLOADCombine - Resolve extending loads +static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + if (Mld->getExtensionType() != ISD::SEXTLOAD) + return SDValue(); + + EVT VT = Mld->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getVectorElementType().getSizeInBits(); + unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert Src0 value + SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0()); + if (Mld->getSrc0().getOpcode() != 
ISD::UNDEF) { + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) + && "WideVecVT should be legal"); + WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + } + // Prepare the new mask + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WideSrc0, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + +} +/// PerformMSTORECombine - Resolve truncating stores +static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); + if (!Mst->isTruncatingStore()) + return SDValue(); + + EVT VT = Mst->getValue().getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue()); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. 
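Both the extending-masked-load and the truncating-masked-store paths above rely on the same index arithmetic: narrow element i corresponds to lane i * SizeRatio of the wide vector, and the extra wide mask lanes are pointed at a zero vector so they stay disabled. A small standalone sketch of that mapping for a v4i32 value truncated to v4i16 (SizeRatio = 2):

#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 4;   // v4i32 stored as v4i16
  const unsigned SizeRatio = 2;  // FromSz / ToSz

  // First NumElems lanes take the low (little-endian) half of each wide
  // element; the remaining lanes select elements of the all-zero second
  // shuffle input, i.e. indices >= NumElems * SizeRatio.
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;
  for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
    ShuffleVec[i] = NumElems * SizeRatio;

  for (int Idx : ShuffleVec)
    std::printf("%d ", Idx);     // prints: 0 2 4 6 8 8 8 8
  std::printf("\n");
  return 0;
}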
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) + && "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + SDValue NewMask; + SDValue Mask = Mst->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), + NewMask, StVT, Mst->getMemOperand(), false); +} /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -24280,13 +22211,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatenation of two XMM registers, perform two stores. - // On Sandy Bridge, 256-bit memory operations are executed by two - // 128-bit ports. However, on Haswell it is better to issue a single 256-bit - // memory operation. + // If we are saving a concatenation of two XMM registers and 32-byte stores + // are slow, such as on Sandy Bridge, perform two 16-byte stores. unsigned Alignment = St->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && !Subtarget->hasInt256() && + if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) @@ -24352,9 +22281,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Find the largest store unit MVT StoreType = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; + for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } @@ -24399,8 +22326,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->getAttributes(). 
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || @@ -24500,7 +22426,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" +/// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, @@ -24626,7 +22552,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { return true; } -/// PerformFADDCombine - Do target-specific dag combines on floating point adds. +/// Do target-specific dag combines on floating point adds. static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -24641,7 +22567,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. +/// Do target-specific dag combines on floating point subs. static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -24656,23 +22582,23 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and -/// X86ISD::FXOR nodes. +/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + // F[X]OR(0.0, x) -> x - // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + + // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); return SDValue(); } -/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and -/// X86ISD::FMAX nodes. +/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); @@ -24693,29 +22619,33 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } -/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. +/// Do target-specific dag combines on X86ISD::FAND nodes. 
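As a concrete reference for the "horizontal" operand shape that isHorizontalBinOp documents above, a scalar model of a four-lane horizontal add in the HADDPS style: the result is formed from adjacent pairs of the first operand followed by adjacent pairs of the second.

#include <cassert>

// Scalar model of a 4 x float horizontal add:
//   Out = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }
static void hadd4(const float A[4], const float B[4], float Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}

int main() {
  float A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40}, O[4];
  hadd4(A, B, O);
  assert(O[0] == 3 && O[1] == 7 && O[2] == 30 && O[3] == 70);
  return 0;
}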
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 - // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); + + // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + return SDValue(); } -/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +/// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { - // FANDN(x, 0.0) -> 0.0 // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + + // FANDN(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + return SDValue(); } @@ -24978,6 +22908,23 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); @@ -24989,20 +22936,47 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, if (MayFoldLoad(Ld)) { // Extract the countS bits from the immediate so we can get the proper // address when narrowing the vector load to a specific element. - // When the second source op is a memory address, interps doesn't use + // When the second source op is a memory address, insertps doesn't use // countS and just gets an f32 from that address. unsigned DestIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); - } else - return SDValue(); - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); - // countS bits are ignored when loading from memory on insertps, which - // means we don't need to explicitly set them to 0. - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), - LoadScalarToVector, N->getOperand(2)); + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); + // countS bits are ignored when loading from memory on insertps, which + // means we don't need to explicitly set them to 0. + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), + LoadScalarToVector, N->getOperand(2)); + } + return SDValue(); +} + +static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. 
This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); } // Helper function of PerformSETCCCombine. It is to materialize "setb reg" @@ -25134,7 +23108,7 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, } static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, - const X86TargetLowering *XTLI) { + const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); @@ -25160,10 +23134,9 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, EVT VT = Ld->getValueType(0); if (!Ld->isVolatile() && !N->getValueType(0).isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !XTLI->getSubtarget()->is64Bit() && - VT == MVT::i64) { - SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), - Ld->getChain(), Op0, DAG); + !Subtarget->is64Bit() && VT == MVT::i64) { + SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( + SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } @@ -25362,6 +23335,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -25374,8 +23348,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); - case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -25414,8 +23390,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); - case X86ISD::INSERTPS: - return PerformINSERTPSCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) + return PerformINSERTPSCombine(N, DAG, Subtarget); + break; + } + case 
X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } @@ -25841,6 +23821,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'L': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || + (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType()); + break; + } + } + return; + case 'M': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 3) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'N': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { @@ -25849,6 +23846,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'O': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 127) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -25938,8 +23943,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -std::pair<unsigned, const TargetRegisterClass*> -X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, +std::pair<unsigned, const TargetRegisterClass *> +X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. @@ -26045,7 +24051,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass*> Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { @@ -26193,7 +24199,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. - // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(AM, Ty)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7c6ffa2..4423015 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -158,6 +158,10 @@ namespace llvm { /// vector to a GPR. MMX_MOVD2W, + /// MMX_MOVW2D - Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, @@ -197,7 +201,12 @@ namespace llvm { /// ADDSUB - Combined add and sub on an FP vector. ADDSUB, - + // FADD, FSUB, FMUL, FDIV, FMIN, FMAX - FP vector ops with rounding mode. + FADD_RND, + FSUB_RND, + FMUL_RND, + FDIV_RND, + // SUBUS - Integer sub with unsigned saturation. 
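The new 'L', 'M' and 'O' cases (alongside the existing 'N') mirror the documented GCC/Clang x86 immediate constraints: 'L' accepts 0xff, 0xffff, and in 64-bit mode 0xffffffff; 'M' accepts 0 through 3; 'N' 0 through 255; 'O' 0 through 127. A minimal compile-only illustration, assuming GCC-style extended asm on an x86 target; constants outside the documented ranges are rejected at compile time.

// Each operand below must be a constant in its constraint's accepted range.
static inline void constraint_demo() {
  asm volatile("" :: "L"(0xffff),  // 0xff / 0xffff / 0xffffffff (64-bit only)
                     "M"(3),       // 0..3, e.g. scaled-index shift counts
                     "N"(0x60),    // 0..255, e.g. in/out port numbers
                     "O"(127));    // 0..127
}

int main() {
  constraint_demo();
  return 0;
}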
SUBUS, @@ -378,6 +387,18 @@ namespace llvm { FNMSUB, FMADDSUB, FMSUBADD, + // FMA with rounding mode + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + RNDSCALE, + + // Compress and expand + COMPRESS, + EXPAND, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. @@ -543,7 +564,8 @@ namespace llvm { // X86 Implementation of the TargetLowering interface class X86TargetLowering final : public TargetLowering { public: - explicit X86TargetLowering(const X86TargetMachine &TM); + explicit X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI); unsigned getJumpTableEncoding() const override; @@ -629,6 +651,10 @@ namespace llvm { /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool isCheapToSpeculateCttz() const override; + + bool isCheapToSpeculateCtlz() const override; + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; @@ -675,9 +701,10 @@ namespace llvm { /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On /// error, this returns a register number of 0. - std::pair<unsigned, const TargetRegisterClass*> - getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const override; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const override; /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. @@ -724,6 +751,10 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + /// Return true if folding a vector load into ExtVal (a sign, zero, or any + /// extend node) is profitable. + bool isVectorLoadExtDesirable(SDValue) const override; + /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true, otherwise fmuladd is expanded to fmul + fadd. @@ -762,9 +793,10 @@ namespace llvm { return !X86ScalarSSEf64 || VT == MVT::f80; } - const X86Subtarget* getSubtarget() const { - return Subtarget; - } + /// Return true if we believe it is correct and profitable to reduce the + /// load node to a smaller type. + bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, + EVT NewVT) const override; /// Return true if the specified scalar FP type is computed in an SSE /// register, not on the X87 floating point stack. @@ -787,6 +819,10 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { return nullptr; // nothing to do, move along. @@ -810,16 +846,14 @@ namespace llvm { bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; - /// \brief Reset the operation actions based on target options. 
- void resetOperationActions() override; - bool useLoadStackGuardNode() const override; /// \brief Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; protected: - std::pair<const TargetRegisterClass*, uint8_t> - findRepresentativeClass(MVT VT) const override; + std::pair<const TargetRegisterClass *, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const override; private: /// Keep a pointer to the X86Subtarget around so that we can @@ -827,10 +861,6 @@ namespace llvm { const X86Subtarget *Subtarget; const DataLayout *TD; - /// Used to store the TargetOptions so that we don't waste time resetting - /// the operation actions unless we have to. - TargetOptions TO; - /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. @@ -930,7 +960,6 @@ namespace llvm { SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b188cd5..4923bc5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1,10 +1,27 @@ +//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 AVX512 instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + // Group template arguments that can be derived from the vector type (EltNum x // EltVT). These are things like the register class for the writemask, etc. // The idea is to pass one of these as the template argument rather than the // individual arguments. -class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc, +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, string suffix = ""> { RegisterClass RC = rc; + ValueType EltVT = eltvt; int NumElts = numelts; // Corresponding mask register class. @@ -23,7 +40,13 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc, // Suffix used in the instruction mnemonic. string Suffix = suffix; - string VTName = "v" # NumElts # EltVT; + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; // The vector VT. 
ValueType VT = !cast<ValueType>(VTName); @@ -53,14 +76,6 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc, VTName)), VTName)); PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); - // Load patterns used for memory operands. We only have this defined in - // case of i64 element types for sub-512 integer vectors. For now, keep - // MemOpFrag undefined in these cases. - PatFrag MemOpFrag = - !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName), - !if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName), - !if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?))); - // The corresponding float type, e.g. v16f32 for v16i32 // Note: For EltSize < 32, FloatVT is illegal and TableGen // fails to compile, so we choose FloatVT = VT @@ -86,6 +101,8 @@ class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc, !if (!eq (EltTypeName, "f64"), SSEPackedDouble, SSEPackedInt)); + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + // A vector type of the same width with element type i32. This is used to // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); @@ -114,6 +131,11 @@ def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. +def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, X86VectorVTInfo i128> { X86VectorVTInfo info512 = i512; @@ -183,7 +205,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - string Round = "", + SDNode Select = vselect, string Round = "", string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : @@ -192,11 +214,11 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, RHS)], [(set _.RC:$dst, MaskingRHS)], [(set _.RC:$dst, - (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], Round, MaskingConstraint, NoItinerary, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and -// the zero-masking variant of the instruction. In the masking case, the +// the zero-masking variant of the vector instruction. In the masking case, the // perserved vector elements come from a new dummy input operand tied to $dst. multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, @@ -208,8 +230,23 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round, - "$src0 = $dst", itin, IsCommutable>; + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, + Round, "$src0 = $dst", itin, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. 
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, + Round, "$src0 = $dst", itin, IsCommutable>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -364,7 +401,7 @@ multiclass vinsert_for_size_no_alt<int Opcode, SDNodeXForm INSERT_get_vinsert_imm> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, i8imm:$src3), + (ins VR512:$src1, From.RC:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts # "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}", @@ -375,7 +412,7 @@ multiclass vinsert_for_size_no_alt<int Opcode, let mayLoad = 1 in def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3), + (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts # "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}", @@ -437,12 +474,12 @@ defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -459,7 +496,7 @@ multiclass vextract_for_size<int Opcode, SDNodeXForm EXTRACT_get_vextract_imm> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, i8imm:$idx), + (ins VR512:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x4", "$idx, $src1", "$src1, $idx", [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), @@ -467,7 +504,7 @@ multiclass vextract_for_size<int Opcode, AVX512AIi8Base, EVEX, EVEX_V512; let mayStore = 1 in def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2), + (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" "$dst, $src1, $src2}", []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; @@ -566,13 +603,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, i32i8imm:$src2), + (ins VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), + (ins 
f32mem:$dst, VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -622,6 +659,45 @@ let ExeDomain = SSEPackedDouble in { avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; } +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. +// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat<string InstName, SDNode OpNode, + X86VectorVTInfo _, RegisterClass SrcRC_v, + RegisterClass SrcRC_s> { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast<Instruction>(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; +} + def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSZm addr:$src)>; def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), @@ -632,74 +708,84 @@ def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), (VBROADCASTSDZm addr:$src)>; -multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, - RegisterClass SrcRC, RegisterClass KRC> { - def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V512; - def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), - (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, EVEX, EVEX_V512, EVEX_KZ; -} - -defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; -defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, - VEX_W; - +multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), "vpbroadcast"##_.Suffix, + "$src", "$src", []>, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm 
VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + def : Pat <(v16i32 (X86vzext VK16WM:$mask)), - (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; def : Pat <(v8i64 (X86vzext VK8WM:$mask)), - (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))), - (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>; + (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>; def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), - (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>; + (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), (v16i32 immAllZerosV), (i16 GR16:$mask))), - (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), - (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, RegisterClass KRC> { def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, VR128X:$src), - !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; } } @@ -716,12 +802,12 @@ multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string 
OpcodeStr, RegisterClass KRC> { let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; } } @@ -752,7 +838,7 @@ def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), (VBROADCASTSSZr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), (VBROADCASTSDZr VR128X:$src)>; - + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), @@ -763,7 +849,7 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), let Predicates = [HasAVX512] in { def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), addr:$src)), sub_ymm)>; } @@ -775,15 +861,15 @@ multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, RegisterClass KRC> { let Predicates = [HasCDI] in def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX, EVEX_V512; - + let Predicates = [HasCDI, HasVLX] in { def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX, EVEX_V128; def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX, EVEX_V256; } } @@ -803,18 +889,18 @@ multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, i8imm:$src2), + (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, i8imm:$src2), + (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, - (_.VT (OpNode (_.MemOpFrag addr:$src1), + (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))))]>, EVEX, EVEX_CD8<_.EltSize, CD8VF>; } @@ -827,7 +913,7 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat("vpermil" # _.Suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86VPermilpv _.RC:$src1, (Ctrl.VT Ctrl.RC:$src2))))]>, @@ -835,10 +921,10 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.MemOp:$src2), !strconcat("vpermil" # _.Suffix, - " \t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>, + (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, EVEX_4V; } } @@ -859,34 +945,34 @@ def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- -multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, +multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V; def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>, EVEX_4V; } -defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, +defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, loadv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, loadv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in -defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, +defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, loadv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, loadv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -897,7 +983,7 @@ let Constraints = "$src1 = $dst" in { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, EVEX_4V; @@ -905,7 +991,7 @@ let Constraints = "$src1 = $dst" in { def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}}|" + "\t{$src3, $src2, $dst {${mask}}|" "$dst {${mask}}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, (OpNode RC:$src1, RC:$src2, @@ -917,7 +1003,7 @@ let Constraints = "$src1 = $dst" in { def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}} {z} |", + "\t{$src3, $src2, $dst {${mask}} {z} |", "$dst {${mask}} {z}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, (OpNode RC:$src1, RC:$src2, @@ -929,7 +1015,7 @@ let Constraints = "$src1 = $dst" in { def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, (mem_frag addr:$src3))))]>, EVEX_4V; @@ -937,7 +1023,7 @@ let Constraints = "$src1 = $dst" in { def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, 
x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}}|" + "\t{$src3, $src2, $dst {${mask}}|" "$dst {${mask}}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, @@ -950,7 +1036,7 @@ let Constraints = "$src1 = $dst" in { def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}} {z}|" + "\t{$src3, $src2, $dst {${mask}} {z}|" "$dst {${mask}} {z}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, @@ -961,16 +1047,16 @@ let Constraints = "$src1 = $dst" in { EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, loadv16i32, i512mem, X86VPermiv3, v16i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, loadv8i64, i512mem, X86VPermiv3, v8i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, loadv16f32, i512mem, X86VPermiv3, v16f32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, loadv8f64, i512mem, X86VPermiv3, v8f64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -990,93 +1076,126 @@ multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC, (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; } -defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, loadv16i32, i512mem, X86VPermv3, v16i32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, loadv8i64, i512mem, X86VPermv3, v8i64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, loadv16f32, i512mem, X86VPermv3, v16f32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, loadv8f64, i512mem, X86VPermv3, v8f64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // -multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, PatFrag mem_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2), - (vt RC:$src1)))]>, EVEX_4V, EVEX_K; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, 
EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K; + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } } +multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + +multiclass blendmask_bw <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw 
<0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -let ExeDomain = SSEPackedSingle in -defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", - VK16WM, VR512, f512mem, - memopv16f32, vselect, v16f32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; -let ExeDomain = SSEPackedDouble in -defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", - VK8WM, VR512, f512mem, - memopv8f64, vselect, v8f64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (i16 GR16:$mask))), - (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (i8 GR8:$mask))), - (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), - VR512:$src1, VR512:$src2)>; - -defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", - VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; - -defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", - VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (i16 GR16:$mask))), - (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (i8 GR8:$mask))), - (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8), - VR512:$src1, VR512:$src2)>; let Predicates = [HasAVX512] in { def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } @@ -1086,35 +1205,40 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), // avx512_cmp_scalar - AVX512 CMPSS and CMPSD multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - Operand CC, SDNode OpNode, ValueType VT, - PatFrag ld_frag, string asm, string asm_alt> { + SDNode OpNode, ValueType VT, + PatFrag ld_frag, string Suffix> { def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VK1:$dst, (OpNode (VT RC:$src1), (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins 
RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; + (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + !strconcat("vcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; + let mayLoad = 1 in def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), + !strconcat("vcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, AVXCC, X86cmpms, f32, loadf32, - "vcmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vcmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64, - "vcmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vcmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - XD, VEX_W; +defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, + XS; +defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, + XD, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1249,7 +1373,7 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), @@ -1257,7 +1381,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RR>, EVEX_4V; let mayLoad = 1 in def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), @@ -1266,7 +1390,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), @@ -1277,7 +1401,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, let mayLoad = 1 in def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), @@ -1290,25 +1414,27 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, // Accept explicit immediate argument form instead of comparison code. 
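An aside on the vpcmp family handled above: the comparison predicate is carried as an 8-bit immediate, and the cc-specific mnemonics (vpcmpltd, vpcmpled, ...) are aliases for the generic spelling with an explicit imm8, which is what the *_alt definitions below exist to accept. A minimal sketch with the public intrinsics, assuming an AVX-512F toolchain and <immintrin.h> (the function name is illustrative):

#include <immintrin.h>

// The predicate is already an explicit immediate at the source level;
// _MM_CMPINT_LE (2) lands in the imm8 field of VPCMPD, which tools may
// print either as "vpcmpled" or as "vpcmpd $2, ...".
__mmask16 le_mask(__m512i a, __m512i b) {
  return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
}
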
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rmi_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik_alt : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in def rmik_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), @@ -1319,10 +1445,9 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> : avx512_icmp_cc<opc, Suffix, OpNode, _> { - let mayLoad = 1 in { def rmib : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), @@ -1332,7 +1457,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; def rmibk : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, AVXCC:$cc), + _.ScalarMemOp:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), @@ -1341,20 +1466,19 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc)))], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; - } // Accept explicit immediate argument form instead of comparison code. 
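The rmib/rmibk variants above add the EVEX embedded-broadcast form, in which a single scalar is loaded from memory and compared against every element. At the source level that corresponds to splatting a scalar and comparing, which the backend can then fold into one {1to16} compare. A hedged sketch, assuming an AVX-512F toolchain (the function name and the exact folding behaviour are illustrative, not guaranteed):

#include <immintrin.h>

// With the broadcast patterns above, the splat + compare below can be
// selected as a single instruction, e.g. "vpcmpled (%rdi){1to16}, %zmm0, %k0".
__mmask16 le_splat(__m512i v, const int *p) {
  __m512i splat = _mm512_set1_epi32(*p);
  return _mm512_cmp_epi32_mask(v, splat, _MM_CMPINT_LE);
}
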
- let isAsmParserOnly = 1, hasSideEffects = 0 in { + let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { def rmib_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, i8imm:$cc), + _.ScalarMemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), @@ -1414,30 +1538,32 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, def rri : AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>; + let hasSideEffects = 0 in def rrib: AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), + "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), [], d>, EVEX_B; def rmi : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set KRC:$dst, - (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; + (X86cmpm (vt RC:$src1), (load addr:$src2), imm:$cc))], d>; // Accept explicit immediate argument form instead of comparison code. 
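The rrib form above is the {sae} (suppress-all-exceptions) variant of the packed FP compare; the int_x86_avx512_mask_cmp_ps_512 patterns that follow select it when FROUND_NO_EXC is requested. A minimal sketch, assuming an AVX-512F toolchain (the function name is illustrative):

#include <immintrin.h>

// _MM_FROUND_NO_EXC selects the {sae} encoding (the rrib form above), so the
// compare is performed without raising floating-point exceptions.
__mmask16 le_quiet(__m512 a, __m512 b) {
  return _mm512_cmp_round_ps_mask(a, b, _CMP_LE_OQ, _MM_FROUND_NO_EXC);
}
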
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + let mayLoad = 1 in def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } } @@ -1465,25 +1591,25 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), imm:$cc), VK8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i8immZExt5:$cc, (i16 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; - + def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i8immZExt5:$cc, (i8 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i8immZExt5:$cc, (i16 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i8immZExt5:$cc, (i8 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; @@ -1495,17 +1621,18 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), // multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, - ValueType vvt, ValueType ivt, X86MemOperand x86memop> { + ValueType vvt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vvt (load addr:$src)))]>; let mayStore = 1 in def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>; } } @@ -1514,34 +1641,32 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } let Predicates = [HasDQI] in - defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, 
v8i1, i8, - i8mem>, + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, VEX, PD; let Predicates = [HasAVX512] in - defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16, - i16mem>, + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, VEX, PS; let Predicates = [HasBWI] in { - defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32, - i32mem>, VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, + VEX, PD, VEX_W; defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, VEX, XD; } let Predicates = [HasBWI] in { - defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64, - i64mem>, VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, + VEX, PS, VEX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, VEX, XD, VEX_W; } @@ -1572,24 +1697,34 @@ let Predicates = [HasBWI] in { let Predicates = [HasDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVBmk addr:$dst, VK8:$src)>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (KMOVBkm addr:$src)>; +} +let Predicates = [HasAVX512, NoDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; } let Predicates = [HasAVX512] in { def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), - (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; def : Pat<(i1 (load addr:$src)), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; - def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), - (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; + def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), + (KMOVWkm addr:$src)>; } let Predicates = [HasBWI] in { def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), (KMOVDmk addr:$dst, VK32:$src)>; + def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))), + (KMOVDkm addr:$src)>; } let Predicates = [HasBWI] in { def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), (KMOVQmk addr:$dst, VK64:$src)>; + def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))), + (KMOVQkm addr:$src)>; } let Predicates = [HasAVX512] in { @@ -1666,7 +1801,7 @@ multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, Predicate prd> { let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set KRC:$dst, (OpNode KRC:$src))]>; } @@ -1703,7 +1838,7 @@ let Predicates = [HasBWI] in def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512, NoDQI] in { def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; @@ -1720,7 +1855,7 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, 
$dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; } @@ -1796,7 +1931,7 @@ multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX512] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { @@ -1825,35 +1960,50 @@ multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1|$src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, VEX, PS; + let Predicates = [HasDQI] in + defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, PD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, PD, VEX_W; + } } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; -def : Pat<(X86cmp VK1:$src1, (i1 0)), - (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src1, VK16))>; - // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512] in - def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), !strconcat(OpcodeStr, - " \t{$imm, $src, $dst|$dst, $src, $imm}"), + "\t{$imm, $src, $dst|$dst, $src, $imm}"), [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; } multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, TAPD, VEX_W; + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, TAPD; + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -1904,10 +2054,14 @@ let Predicates = [HasVLX] in { } def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // @@ -2001,7 +2155,7 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, 
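Context for the kmov load/store patterns added just above: a __mmask16 is ABI-compatible with a 16-bit integer, so spilling a mask to memory and reloading it is plain integer code at the source level, and the new (load addr:$src) / (store KRC:$src, addr:$dst) patterns let that round-trip select directly to kmovw with a memory operand. A minimal sketch, assuming an AVX-512F toolchain (function names are illustrative; whether kmovw is actually emitted depends on register allocation):

#include <immintrin.h>
#include <stdint.h>

// __mmask16 is defined as a 16-bit integer type in <immintrin.h>, so a mask
// round-trips through ordinary memory; the patterns above allow the backend
// to use "kmovw %k1, (%rdi)" / "kmovw (%rdi), %k1" for this.
void spill_mask(__mmask16 m, uint16_t *slot) { *slot = m; }
__mmask16 reload_mask(const uint16_t *slot) { return *slot; }
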
multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, ValueType OpVT, RegisterClass KRC, RegisterClass RC, X86MemOperand memop, Domain d> { - let isAsmParserOnly = 1, hasSideEffects = 0 in { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, EVEX; @@ -2088,6 +2242,22 @@ def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VMOVAPDZrm addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), + (VMOVAPSZrm addr:$ptr)>; + def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -2097,6 +2267,55 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), 
VR256:$src0, sub_ymm), + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", "16", "8", "4", SSEPackedInt, HasAVX512>, avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", @@ -2171,6 +2390,46 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), @@ -2277,12 +2536,12 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, +multiclass avx512_move_scalar <string asm, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, PatFrag mem_pat> { let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128X:$dst, (vt (OpNode VR128X:$src1, (scalar_to_vector RC:$src2))))], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; @@ -2290,19 +2549,19 @@ multiclass avx512_move_scalar <string asm, RegisterClass RC, def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), !strconcat(asm, - " \t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), + "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], 
IIC_SSE_MOV_S_RM>, EVEX, VEX_LIG; let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), [], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG, EVEX_K; } // mayStore @@ -2359,7 +2618,7 @@ let Predicates = [HasAVX512] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (V_SET0)), (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), @@ -2488,7 +2747,7 @@ let AddedComplexity = 15 in def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (v2i64 (X86vzmovl + [(set VR128X:$dst, (v2i64 (X86vzmovl (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; @@ -2510,7 +2769,7 @@ let Predicates = [HasAVX512] in { (VMOV64toPQIZrr GR64:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIZrr GR32:$src)>; - + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -2751,48 +3010,48 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, { def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), [], itins.rr>, EVEX_4V, EVEX_K; def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" , + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}} {z}" , "|$dst {${mask}} {z}, $src1, $src2}"), [], itins.rr>, EVEX_4V, EVEX_KZ; } let mayLoad = 1 in { def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), [], itins.rm>, EVEX_4V, EVEX_K; def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), [], itins.rm>, EVEX_4V, EVEX_KZ; def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst|$dst, $src1, ${src2}", 
BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B; def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K; def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ; @@ -2811,12 +3070,12 @@ defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", + loadv8i64, i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", + loadv8i64, i512mem, loadi64, i64mem, "{1to8}", SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))), @@ -2902,16 +3161,16 @@ multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, d>, EVEX_4V; } -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64, +defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64, +defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64, +defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64, +defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -2920,52 +3179,52 @@ multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, X86MemOperand x86memop> { def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], IIC_SSE_UNPCK>, EVEX_4V; def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (bitconvert (memop_frag addr:$src2)))))], IIC_SSE_UNPCK>, EVEX_4V; } defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, loadv16i32, i512mem>, 
EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, loadv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, loadv16i32, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, loadv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - PSHUFD // multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, + SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, EVEX; def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode (mem_frag addr:$src1), (i8 imm:$src2))))]>, EVEX; } -defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, +defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, loadv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; //===----------------------------------------------------------------------===// @@ -3027,7 +3286,16 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _, bit IsCommutable> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + EVEX_4V, EVEX_B, EVEX_RC; +} + +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3053,12 +3321,23 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, } } -defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; -defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; +multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, + avx512_fp_binop_p_round<0x5E, "vdiv", 
X86fdivRnd>; defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; -defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), @@ -3083,34 +3362,34 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, SDNode OpNode, ValueType vt> { def rr : AVX512PI<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs KRC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], SSEPackedInt>, EVEX_4V; def rm : AVX512PI<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V; } defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem, - memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512, + loadv16i32, X86testm, v16i32>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, - memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W, + loadv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let Predicates = [HasCDI] in { defm VPTESTNMDZ : avx512_vptest<0x27, "vptestnmd", VK16, VR512, f512mem, - memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512, + loadv16i32, X86testnm, v16i32>, T8XS, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPTESTNMQZ : avx512_vptest<0x27, "vptestnmq", VK8, VR512, f512mem, - memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W, + loadv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -3121,147 +3400,127 @@ def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), - (ins _.RC:$src1, i8imm:$src2), OpcodeStr, + (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), - (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr, + (ins _.MemOp:$src1, u8imm:$src2), 
OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))), + (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))), " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, ValueType SrcVT, - PatFrag bc_frag, RegisterClass KRC> { - // src2 is always 128-bit - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, VR128X:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))], - SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; - def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, i128mem:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, - (bc_frag (memopv2i64 addr:$src2)))))], - SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; - def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, i128mem:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, VR128X:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, i128mem:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; +} + +multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512; +} + +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, + v16i32_info>, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, + v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; } defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, - VR512, v8i64, 
v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag> { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (vt (OpNode RC:$src1, (vt RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>, - EVEX_4V; + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V; } -defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; +multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; +} + +multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, + avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, 
"vpsrlv", srl>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, +multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, X86MemOperand x86memop, PatFrag memop_frag> { def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } -defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>, +defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), (VMOVDDUPZrm addr:$src)>; @@ -3273,26 +3532,26 @@ multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, ValueType vt, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop> { def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX; let mayLoad = 1 in def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX; } defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))), (VMOVSHDUPZrm addr:$src)>; def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))), (VMOVSLDUPZrm addr:$src)>; //===----------------------------------------------------------------------===// @@ -3336,73 +3595,93 @@ multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, _.MemOp:$src3), - !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (_.MemOpFrag addr:$src3))))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, _.ScalarMemOp:$src3), - !strconcat(OpcodeStr, " \t{${src3}", _.BroadcastStr, - ", $src2, $dst|$dst, $src2, ${src3}", _.BroadcastStr, "}"), - [(set _.RC:$dst, (OpNode _.RC:$src1, _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))]>, EVEX_B; -} + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins 
_.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + AVX512FMA3Base, EVEX_B; + } +} // Constraints = "$src1 = $dst" + +let Constraints = "$src1 = $dst" in { +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. +multiclass avx512_fma3_round_rrb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; + } } // Constraints = "$src1 = $dst" +multiclass avx512_fma3_round_forms<bits<8> opc213, string OpcodeStr, + X86VectorVTInfo VTI, SDPatternOperator OpNode> { + defm v213r : avx512_fma3_round_rrb<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), + VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; +} + multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231, string OpcodeStr, X86VectorVTInfo VTI, SDPatternOperator OpNode> { - defm v213 : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), - VTI, OpNode>, - EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>; + defm v213r : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), + VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; - defm v231 : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), - VTI>, - EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>; + defm v231r : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), + VTI>, EVEX_CD8<VTI.EltSize, CD8VF>; } +multiclass avx512_fma3p<bits<8> opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode, + SDPatternOperator OpNodeRnd> { let ExeDomain = SSEPackedSingle in { - defm VFMADDPSZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v16f32_info, X86Fmadd>; - defm VFMSUBPSZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v16f32_info, X86Fmsub>; - defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v16f32_info, X86Fmaddsub>; - defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v16f32_info, X86Fmsubadd>; - defm VFNMADDPSZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", - v16f32_info, X86Fnmadd>; - defm VFNMSUBPSZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v16f32_info, X86Fnmsub>; -} + defm NAME##PSZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v16f32_info, OpNode>, + avx512_fma3_round_forms<opc213, OpcodeStr, + v16f32_info, OpNodeRnd>, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v8f32x_info, OpNode>, EVEX_V256; + defm NAME##PSZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v4f32x_info, OpNode>, EVEX_V128; + } let ExeDomain = SSEPackedDouble in { - defm VFMADDPDZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v8f64_info, X86Fmadd>, VEX_W; - defm VFMSUBPDZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v8f64_info, X86Fmsub>, VEX_W; - defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v8f64_info, X86Fmaddsub>, VEX_W; - defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v8f64_info, X86Fmsubadd>, VEX_W; - defm VFNMADDPDZ : avx512_fma3p_forms<0xAC, 
0xBC, "vfnmadd", - v8f64_info, X86Fnmadd>, VEX_W; - defm VFNMSUBPDZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v8f64_info, X86Fnmsub>, VEX_W; + defm NAME##PDZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v8f64_info, OpNode>, + avx512_fma3_round_forms<opc213, OpcodeStr, + v8f64_info, OpNodeRnd>, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v4f64x_info, OpNode>, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v2f64x_info, OpNode>, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; + let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { let mayLoad = 1 in def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3)))]>; def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2), - !strconcat(OpcodeStr, " \t{${src2}", _.BroadcastStr, + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"), [(set _.RC:$dst, (OpNode _.RC:$src1, (_.VT (X86VBroadcast @@ -3412,65 +3691,54 @@ multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f<bits<8> opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode,v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - 
defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ256 : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, +multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, PatFrag mem_frag> { let isCommutable = 1 in def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; let mayLoad = 1 in def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, f128mem:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, (mem_frag addr:$src3))))]>; @@ -3503,12 +3771,12 @@ multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), - !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), - !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; } // hasSideEffects = 0 } @@ -3576,12 +3844,12 @@ multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstR string asm> { let hasSideEffects = 0 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, Requires<[HasAVX512]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 
[]>, EVEX, VEX_LIG, Requires<[HasAVX512]>; } // hasSideEffects = 0 } @@ -3679,10 +3947,10 @@ multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; } @@ -3755,21 +4023,21 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, +multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, + RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [], d>, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; } // hasSideEffects = 0 @@ -3781,29 +4049,29 @@ multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; } // hasSideEffects = 0 } defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, - memopv8f64, f512mem, v8f32, v8f64, + loadv8f64, f512mem, v8f32, v8f64, SSEPackedSingle>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, - memopv4f64, f256mem, v8f64, v8f32, + loadv4f64, f256mem, v8f64, v8f32, SSEPackedDouble>, EVEX_V512, PS, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; - + def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))), (VCVTPD2PSZrr VR512:$src)>; @@ -3817,27 +4085,27 @@ def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), //===----------------------------------------------------------------------===// defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, - 
memopv8i64, i512mem, v16f32, v16i32, + loadv8i64, i512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, - memopv4i64, i256mem, v8f64, v8i32, + loadv4i64, i256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, - memopv16f32, f512mem, v16i32, v16f32, + loadv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + loadv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, - memopv16f32, f512mem, v16i32, v16f32, + loadv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -3847,29 +4115,29 @@ def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src), (VCVTTPS2UDQZrr VR512:$src)>; defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, - memopv8f64, f512mem, v8i32, v8f64, + loadv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; - + // cvttpd2udq (src, 0, mask-all-ones, sae-current) def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), (VCVTTPD2UDQZrr VR512:$src)>; defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, - memopv4i64, f256mem, v8f64, v8i32, + loadv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; - + defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, - memopv16i32, f512mem, v16f32, v16i32, + loadv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3877,7 +4145,7 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3904,23 +4172,23 @@ multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC, X86MemOperand x86memop, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [], d>, EVEX; def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [], d>, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [], d>, EVEX; } // hasSideEffects = 0 } defm 
VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, PD, + loadv16f32, f512mem, SSEPackedSingle>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, + loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src), @@ -3932,10 +4200,10 @@ def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src), (VCVTPD2DQZrrb VR512:$src, imm:$rc)>; defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, + loadv16f32, f512mem, SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, VEX_W, + loadv8f64, f512mem, SSEPackedDouble>, VEX_W, PS, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src), @@ -3969,13 +4237,13 @@ multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, X86MemOperand x86memop> { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", + (ins srcRC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; let hasSideEffects = 0, mayStore = 1 in def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; + (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; } defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, @@ -4022,7 +4290,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } - + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop> { @@ -4030,12 +4298,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; } } } @@ -4130,60 +4398,40 @@ def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), (VRCP14PDZr VR512:$src)>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd -multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; - def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), - []>, EVEX_4V, EVEX_B; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, 
x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; - } -} -} - -defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; +multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { -def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; -def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; -def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; +} -def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -4196,12 +4444,14 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), + "{sae}">, EVEX_B; defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", (OpNode (_.FloatVT - (bitconvert (_.LdFrag addr:$src))), (i32 
FROUND_CURRENT))>; + (bitconvert (_.LdFrag addr:$src))), + (i32 FROUND_CURRENT))>; defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", @@ -4218,7 +4468,7 @@ multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { } let Predicates = [HasERI], hasSideEffects = 0 in { - + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD; defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD; defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD; @@ -4257,7 +4507,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XS, EVEX_4V; let mayLoad = 1 in { @@ -4271,7 +4521,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2))], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; } @@ -4285,7 +4535,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XD, EVEX_4V, VEX_W; let mayLoad = 1 in { @@ -4299,8 +4549,8 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2))]>, XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; } } @@ -4332,8 +4582,8 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, SSE_SQRTSS, SSE_SQRTSD>; let Predicates = [HasAVX512] in { @@ -4343,7 +4593,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), (VSQRTPDZr VR512:$src1)>; - + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -4383,107 +4633,6 @@ let Predicates = [HasAVX512] in { } -multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int, - CD8VForm VForm> { -let ExeDomain = SSEPackedSingle in { - // Intrinsic operation, reg. 
- // Vector intrinsic operation, reg - def PSr : AVX512AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>; - - // Vector intrinsic operation, mem - def PSm : AVX512AIi8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, - EVEX_CD8<32, VForm>; -} // ExeDomain = SSEPackedSingle - -let ExeDomain = SSEPackedDouble in { - // Vector intrinsic operation, reg - def PDr : AVX512AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>; - - // Vector intrinsic operation, mem - def PDm : AVX512AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, - EVEX_CD8<64, VForm>; -} // ExeDomain = SSEPackedDouble -} - -multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int> { -let ExeDomain = GenericDomain in { - // Operation, reg. - let hasSideEffects = 0 in - def SSr : AVX512AIi8<opcss, MRMSrcReg, - (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in - def SSr_Int : AVX512AIi8<opcss, MRMSrcReg, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>; - - // Intrinsic operation, mem. - def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, - sse_load_f32:$src2, imm:$src3))]>, - EVEX_CD8<32, CD8VT1>; - - // Operation, reg. - let hasSideEffects = 0 in - def SDr : AVX512AIi8<opcsd, MRMSrcReg, - (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, VEX_W; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in - def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>, - VEX_W; - - // Intrinsic operation, mem. - def SDm : AVX512AIi8<opcsd, MRMSrcMem, - (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>, - VEX_W, EVEX_CD8<64, CD8VT1>; -} // ExeDomain = GenericDomain -} - multiclass avx512_rndscale<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, PatFrag mem_frag, Domain d> { @@ -4491,23 +4640,22 @@ let ExeDomain = d in { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; // Vector intrinsic operation, mem def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; } // ExeDomain } - defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - memopv16f32, SSEPackedSingle>, EVEX_V512, + loadv16f32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), @@ -4517,7 +4665,7 @@ def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - memopv8f64, SSEPackedDouble>, EVEX_V512, + loadv8f64, SSEPackedDouble>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), @@ -4525,50 +4673,72 @@ def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), FROUND_CURRENT)), (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; -multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, - Operand x86memop, RegisterClass RC, Domain d> { -let ExeDomain = d in { - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; +multiclass +avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; -} // ExeDomain + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC))), "{sae}">, EVEX_B; + + let mayLoad = 1 in + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + } + let Predicates = [HasAVX512] in { + def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 
0x3))), _.FRC)>; + def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; + + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x1))), _.FRC)>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x3))), _.FRC)>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0xc))), _.FRC)>; + } } -defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, - SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; - -defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, - SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; - -def : Pat<(ffloor FR32X:$src), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>; -def : Pat<(f64 (ffloor FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>; -def : Pat<(f32 (fnearbyint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>; -def : Pat<(f64 (fnearbyint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>; -def : Pat<(f32 (fceil FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>; -def : Pat<(f64 (fceil FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>; -def : Pat<(f32 (frint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>; -def : Pat<(f64 (frint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>; -def : Pat<(f32 (ftrunc FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>; -def : Pat<(f64 (ftrunc FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>; +defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; + +let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), @@ -4590,7 +4760,7 @@ def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; - +} //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- @@ -4600,32 +4770,32 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, RegisterClass KRC, X86MemOperand x86memop> { def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins srcRC:$src), - !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, EVEX; def rrk : AVX512XS8I<opc, 
MRMDestReg, (outs dstRC:$dst), (ins KRC:$mask, srcRC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins KRC:$mask, srcRC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def mrk : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), []>, EVEX, EVEX_K; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, +defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -4679,151 +4849,158 @@ multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC, def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX; def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>, EVEX; def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; } } defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + loadv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, + loadv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + loadv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; defm VPMOVZXWQZ: 
avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + loadv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + loadv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + loadv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, + loadv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + loadv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + loadv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + loadv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations -multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand memop> { -let mayLoad = 1, +multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag GatherNode> { +let mayLoad = 1, hasTwoExplicitDefs = 1, Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb), - (ins RC:$src1, KRC:$mask, memop:$src2), + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), + (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set _.RC:$dst, _.KRCWM:$mask_wb, + (_.VT (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, + vectoraddr:$src2)))]>, EVEX, EVEX_K, + EVEX_CD8<_.EltSize, CD8VT1>; } let ExeDomain = SSEPackedDouble in { -defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem, + mgatherv8i32>, EVEX_V512, VEX_W; +defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem, + mgatherv8i64>, EVEX_V512, VEX_W; } let ExeDomain = SSEPackedSingle in { -defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem, + mgatherv16i32>, EVEX_V512; +defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem, + mgatherv8i64>, EVEX_V512; } - -defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPGATHERDDZ : 
avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; - -defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; - -multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand memop> { + +defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem, + mgatherv8i32>, EVEX_V512, VEX_W; +defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem, + mgatherv16i32>, EVEX_V512; + +defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem, + mgatherv8i64>, EVEX_V512, VEX_W; +defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem, + mgatherv8i64>, EVEX_V512; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag ScatterNode> { + let mayStore = 1, Constraints = "$mask = $mask_wb" in - def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb), - (ins memop:$dst, KRC:$mask, RC:$src2), + + def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), + (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), + _.KRCWM:$mask, vectoraddr:$dst))]>, + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; } let ExeDomain = SSEPackedDouble in { -defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", v8f64_info, vy64xmem, + mscatterv8i32>, EVEX_V512, VEX_W; +defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", v8f64_info, vz64mem, + mscatterv8i64>, EVEX_V512, VEX_W; } let ExeDomain = SSEPackedSingle in { -defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", v16f32_info, vz32mem, + mscatterv16i32>, EVEX_V512; +defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", v8f32x_info, vz64mem, + mscatterv8i64>, EVEX_V512; } -defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", v8i64_info, vy64xmem, + mscatterv8i32>, EVEX_V512, VEX_W; +defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", v16i32_info, vz32mem, + mscatterv16i32>, EVEX_V512; -defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", v8i64_info, vz64mem, + mscatterv8i64>, EVEX_V512, VEX_W; +defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", v8i32x_info, vz64mem, + mscatterv8i64>, EVEX_V512; // prefetch multiclass 
avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, RegisterClass KRC, X86MemOperand memop> { let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), - !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>, EVEX, EVEX_K; } @@ -4838,7 +5015,7 @@ defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; - + defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; @@ -4881,41 +5058,41 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string OpcodeStr, PatFrag mem_frag, Domain d> { def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffle]>; } -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32, +defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64, +defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v16i32 (X86Shufp VR512:$src1, - (memopv16i32 addr:$src2), (i8 imm:$imm))), + (loadv16i32 addr:$src2), (i8 imm:$imm))), (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, - (memopv8i64 addr:$src2), (i8 imm:$imm))), + (loadv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; multiclass avx512_valign<X86VectorVTInfo _> { defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), "valign"##_.Suffix, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, @@ -4928,9 +5105,9 @@ multiclass avx512_valign<X86VectorVTInfo _> { let mayLoad = 1 in def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), !strconcat("valign"##_.Suffix, - " \t{$src3, $src2, $src1, $dst|" + "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } @@ -4946,43 +5123,43 @@ multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType 
OpVT, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; def rmb : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst|$dst, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B; def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B, EVEX_K; def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B, EVEX_KZ; @@ -5012,57 +5189,65 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPABSQZrr VR512:$src)>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, +multiclass avx512_conflict<bits<8> opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { + let hasSideEffects = 0 in { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, ${dst} |${dst}, $src}"), + !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), []>, EVEX; + let mayLoad = 1 in def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, ${dst}|${dst}, $src}"), + !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), []>, EVEX; + let mayLoad = 1 in def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B; def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} 
{z}, $src}"), []>, EVEX, EVEX_KZ; + let mayLoad = 1 in def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; + let mayLoad = 1 in def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_KZ, EVEX_B; - + let Constraints = "$src1 = $dst" in { def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; + let mayLoad = 1 in def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; + let mayLoad = 1 in def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), []>, EVEX, EVEX_K, EVEX_B; - } + } + } } let Predicates = [HasCDI] in { @@ -5109,11 +5294,11 @@ def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, (VPLZCNTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; -def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), +def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), (VPLZCNTDrm addr:$src)>; def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), +def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), (VPLZCNTQrm addr:$src)>; def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), (VPLZCNTQrr VR512:$src)>; @@ -5123,7 +5308,14 @@ def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; def : Pat<(store VK1:$src, addr:$dst), - (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>; + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(store VK8:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), (truncstore node:$val, node:$ptr), [{ @@ -5135,10 +5327,10 @@ def : Pat<(truncstorei1 GR8:$src, addr:$dst), multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), - !strconcat(OpcodeStr##Vec.Suffix, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; } - + multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { let Predicates = [prd] in @@ -5160,5 +5352,108 @@ multiclass avx512_convert_mask_to_vector<string OpcodeStr> { defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, HasDQI>, VEX_W; } - + defm VPMOVM2 : 
avx512_convert_mask_to_vector<"vpmovm2">; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// +multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, + _.ImmAllZerosV)))]>, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, + _.RC:$src0)))]>, EVEX_K; + + let mayStore = 1 in { + def mrk : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(store (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, undef)), + addr:$dst)]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, (_.VT _.RC:$src), + _.ImmAllZerosV)))]>, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT _.RC:$src), _.RC:$src0)))]>, EVEX_K; + + let mayLoad = 1, Constraints = "$src0 = $dst" in + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT (bitconvert + (_.LdFrag addr:$src))), + _.RC:$src0)))]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + + let mayLoad = 1 in + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT (bitconvert (_.LdFrag addr:$src))), + _.ImmAllZerosV)))]>, + EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + +} + +multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; 
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 25e1e80..78efc4d 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -15,13 +15,13 @@ //===----------------------------------------------------------------------===// // LEA - Load Effective Address let SchedRW = [WriteLEA] in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def LEA16r : I<0x8D, MRMSrcMem, - (outs GR16:$dst), (ins i32mem:$src), + (outs GR16:$dst), (ins anymem:$src), "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16; let isReMaterializable = 1 in def LEA32r : I<0x8D, MRMSrcMem, - (outs GR32:$dst), (ins i32mem:$src), + (outs GR32:$dst), (ins anymem:$src), "lea{l}\t{$src|$dst}, {$dst|$src}", [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, OpSize32, Requires<[Not64BitMode]>; @@ -65,18 +65,18 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>; // AX,DX = AX*GR16 -let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in +let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), "mul{w}\t$src", [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>; // EAX,EDX = EAX*GR32 -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), "mul{l}\t$src", [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/], IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>; // RAX,RDX = RAX*GR64 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), "mul{q}\t$src", [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/], @@ -91,7 +91,7 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), [(set AL, (mul AL, (loadi8 addr:$src))), (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>; // AX,DX = AX*[mem16] -let mayLoad = 1, neverHasSideEffects = 1 in { +let mayLoad = 1, hasSideEffects = 0 in { let Defs = [AX,DX,EFLAGS], Uses = [AX] in def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), "mul{w}\t$src", @@ -107,7 +107,7 @@ def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>; } -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { // AL,AH = AL*GR8 let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], @@ -145,7 +145,7 @@ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>; } -} // neverHasSideEffects +} // hasSideEffects let Defs = [EFLAGS] in { @@ -456,64 +456,29 @@ def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "inc{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))], IIC_UNARY_REG>; - -let isConvertibleToThreeAddress = 
1, CodeSize = 1 in { // Can xform into LEA. -def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; + IIC_UNARY_REG>, OpSize32; def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))], IIC_UNARY_REG>; -} // isConvertibleToThreeAddress = 1, CodeSize = 1 - - -// In 64-bit mode, single byte INC and DEC cannot be encoded. -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { -// Can transform into LEA. -def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[In64BitMode]>; -def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "inc{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[In64BitMode]>; -def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "dec{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[In64BitMode]>; -def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "dec{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress = 1, CodeSize = 2 -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - CodeSize = 2 in { -def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "inc{w}\t$dst", [], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "inc{l}\t$dst", [], IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; -def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "dec{w}\t$dst", [], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "dec{l}\t$dst", [], IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; -} // isCodeGenOnly = 1, ForceDisassemble = 1, HasSideEffects = 0, CodeSize = 2 - +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 } // Constraints = "$src1 = $dst", SchedRW let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { @@ -522,35 +487,13 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { (implicit EFLAGS)], IIC_UNARY_MEM>; def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; - -// These are duplicates of their 32-bit counterparts. Only needed so X86 knows -// how to unfold them. -// FIXME: What is this for?? -def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", - [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[In64BitMode]>; -def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", - [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[In64BitMode]>; -def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", - [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[In64BitMode]>; -def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", - [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[In64BitMode]>; } // CodeSize = 2, SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { @@ -559,21 +502,29 @@ def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "dec{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], IIC_UNARY_REG>; -let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. -def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + IIC_UNARY_REG>, OpSize16; +def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; + IIC_UNARY_REG>, OpSize32; def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))], IIC_UNARY_REG>; -} // CodeSize = 2 +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 } // Constraints = "$src1 = $dst", SchedRW @@ -583,12 +534,10 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { (implicit EFLAGS)], IIC_UNARY_MEM>; def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; @@ -710,15 +659,6 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALU]>; -// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has -// just a regclass (no eflags) as a result. -class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], - IIC_BIN_NONMEM>; - // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has // just a EFLAGS as a result. class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -825,13 +765,6 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let ImmT = typeinfo.ImmEncoding; } -// BinOpRI_R - Instructions like "add reg, reg, imm". -class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; - // BinOpRI_F - Instructions like "cmp reg, imm". class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDPatternOperator opnode, Format f> @@ -864,30 +797,23 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let ImmT = Imm8; // Always 8-bit immediate. } -// BinOpRI8_R - Instructions like "add reg, reg, imm8". -class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; - // BinOpRI8_F - Instructions like "cmp reg, imm8". class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), [(set EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; // BinOpRI8_RF - Instructions like "add reg, reg, imm8". 
class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; // BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, @@ -923,8 +849,8 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; // BinOpMI - Instructions like "add [mem], imm". -class BinOpMI<string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern, bits<8> opcode = 0x80, +class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM> : ITy<opcode, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), @@ -934,27 +860,26 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo, } // BinOpMI_RMW - Instructions like "add [mem], imm". -class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, +class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, + : BinOpMI<opcode, mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>; // BinOpMI_RMW_FF - Instructions like "adc [mem], imm". -class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, +class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>; + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; // BinOpMI_F - Instructions like "cmp [mem], imm". -class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode, Format f, bits<8> opcode = 0x80> - : BinOpMI<mnemonic, typeinfo, f, +class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), - typeinfo.ImmOperator:$src))], - opcode>; + typeinfo.ImmOperator:$src))]>; // BinOpMI8 - Instructions like "add [mem], imm8". class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, @@ -969,7 +894,7 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_RMW - Instructions like "add [mem], imm8". class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(store (opnode (load addr:$dst), typeinfo.Imm8Operator:$src), addr:$dst), @@ -977,7 +902,7 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". 
class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(store (opnode (load addr:$dst), typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), @@ -985,7 +910,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_F - Instructions like "cmp [mem], imm8". class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.Imm8Operator:$src))]>; @@ -1023,12 +948,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit CommutableRR, bit ConvertibleToThreeAddress> { let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; - def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; - def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1041,6 +967,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -1048,7 +976,6 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; @@ -1066,10 +993,20 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>; + } } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, @@ -1094,12 +1031,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit ConvertibleToThreeAddress> { let Uses = [EFLAGS], Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1112,6 +1050,8 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -1119,7 +1059,6 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; - def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; @@ -1137,10 +1076,20 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>; + } } // Uses = [EFLAGS], Defs = [EFLAGS] def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, @@ -1162,12 +1111,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { let Defs = [EFLAGS] in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } } // isCommutable def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1180,6 +1130,8 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -1187,7 +1139,6 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; - def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; @@ -1204,10 +1155,19 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>; + } } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, @@ -1272,15 +1232,15 @@ let isCompare = 1 in { def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; - def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; - def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; - def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; - def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the // register class is constrained to GR8_NOREX. This pseudo is explicitly // marked side-effect free, since it doesn't have an isel pattern like - // other test instructions. + // other test instructions. let isPseudo = 1, hasSideEffects = 0 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; @@ -1332,7 +1292,7 @@ let Predicates = [HasBMI] in { // MULX Instruction // multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), @@ -1355,19 +1315,19 @@ let Predicates = [HasBMI2] in { //===----------------------------------------------------------------------===// // ADCX Instruction // -let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS], +let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], Constraints = "$src0 = $dst", AddedComplexity = 10 in { let SchedRW = [WriteALU] in { def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))], - IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX]>; + IIC_BIN_CARRY_NONMEM>, T8PD; def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], - IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX, In64BitMode]>; + IIC_BIN_CARRY_NONMEM>, T8PD; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { @@ -1375,37 +1335,34 @@ let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS], (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))], - IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX]>; + IIC_BIN_CARRY_MEM>, T8PD; def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], - IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX, In64BitMode]>; + IIC_BIN_CARRY_MEM>, T8PD; } } //===----------------------------------------------------------------------===// // ADOX Instruction 
// -let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS] in { +let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], + Uses = [EFLAGS] in { let SchedRW = [WriteALU] in { def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX]>; + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX, In64BitMode]>; + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS, Requires<[HasADX]>; + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS, Requires<[HasADX, In64BitMode]>; + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; } } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 117b6ff..18bbe5d 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -32,7 +32,7 @@ def GetLo8XForm : SDNodeXForm<imm, [{ // PIC base construction. This expands to code that looks like this: // call $next_inst // popl %destreg" -let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), "", []>; @@ -43,15 +43,18 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, + []>, Requires<[NotLP64]>; def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[NotLP64]>; } +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into // a stack adjustment and the codegen must know that they may modify the stack @@ -59,16 +62,17 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, + []>, Requires<[IsLP64]>; def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[IsLP64]>; } - +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; // x86-64 va_start lowering magic. 
@@ -259,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1, neverHasSideEffects = 1 in + isCodeGenOnly = 1, hasSideEffects = 0 in def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; @@ -471,59 +475,50 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions -// X86 doesn't have 8-bit conditional moves. Use a customInserter to -// emit control flow. An alternative to this is to mark i8 SELECT as Promote, -// however that requires promoting the operands, and can induce additional -// i8 register pressure. -let usesCustomInserter = 1, Uses = [EFLAGS] in { -def CMOV_GR8 : I<0, Pseudo, - (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), - "#CMOV_GR8 PSEUDO!", - [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, - imm:$cond, EFLAGS))]>; - -let Predicates = [NoCMov] in { -def CMOV_GR32 : I<0, Pseudo, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), - "#CMOV_GR32* PSEUDO!", - [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; -def CMOV_GR16 : I<0, Pseudo, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), - "#CMOV_GR16* PSEUDO!", - [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; -} // Predicates = [NoCMov] - -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE1. -let Predicates = [FPStackf32] in -def CMOV_RFP32 : I<0, Pseudo, - (outs RFP32:$dst), - (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), - "#CMOV_RFP32 PSEUDO!", - [(set RFP32:$dst, - (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, - EFLAGS))]>; -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE2. -let Predicates = [FPStackf64] in -def CMOV_RFP64 : I<0, Pseudo, - (outs RFP64:$dst), - (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), - "#CMOV_RFP64 PSEUDO!", - [(set RFP64:$dst, - (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, - EFLAGS))]>; -def CMOV_RFP80 : I<0, Pseudo, - (outs RFP80:$dst), - (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), - "#CMOV_RFP80 PSEUDO!", - [(set RFP80:$dst, - (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, - EFLAGS))]>; -} // UsesCustomInserter = 1, Uses = [EFLAGS] +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV_"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; +} +let usesCustomInserter = 1, Uses = [EFLAGS] in { + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOVrr_PSEUDO<GR8, i8>; + + let Predicates = [NoCMov] in { + defm _GR32 : CMOVrr_PSEUDO<GR32, i32>; + defm _GR16 : CMOVrr_PSEUDO<GR16, i16>; + } // Predicates = [NoCMov] + + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. 
+ let Predicates = [FPStackf32] in + defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>; + + defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; + + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; + defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; + defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; + defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>; + defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>; + defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>; + defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>; + defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>; +} // usesCustomInserter = 1, Uses = [EFLAGS] //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -600,12 +595,12 @@ def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, "{$src2, $dst|$dst, $src2}"), [], IIC_ALU_MEM>, OpSize32, LOCK; -def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), - !strconcat(mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, @@ -859,79 +854,6 @@ def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), "#ACQUIRE_MOV PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; -//===----------------------------------------------------------------------===// -// Conditional Move Pseudo Instructions. -//===----------------------------------------------------------------------===// - -// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after -// instruction selection into a branch sequence. 
-let Uses = [EFLAGS], usesCustomInserter = 1 in { - def CMOV_FR32 : I<0, Pseudo, - (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), - "#CMOV_FR32 PSEUDO!", - [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, - EFLAGS))]>; - def CMOV_FR64 : I<0, Pseudo, - (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), - "#CMOV_FR64 PSEUDO!", - [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, - EFLAGS))]>; - def CMOV_V4F32 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V4F32 PSEUDO!", - [(set VR128:$dst, - (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2F64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2F64 PSEUDO!", - [(set VR128:$dst, - (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2I64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2I64 PSEUDO!", - [(set VR128:$dst, - (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F32 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V8F32 PSEUDO!", - [(set VR256:$dst, - (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4F64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4F64 PSEUDO!", - [(set VR256:$dst, - (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4I64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4I64 PSEUDO!", - [(set VR256:$dst, - (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8I64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8I64 PSEUDO!", - [(set VR512:$dst, - (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8F64 PSEUDO!", - [(set VR512:$dst, - (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V16F32 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V16F32 PSEUDO!", - [(set VR512:$dst, - (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; -} - //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules @@ -1010,6 +932,9 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV64mi32 addr:$dst, tblockaddress:$src)>, Requires<[NearData, IsStatic]>; +def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>; + // Calls // tls has some funny stuff here... 
@@ -1058,12 +983,12 @@ def : Pat<(X86tcret (load addr:$dst), imm:$off), Requires<[Not64BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), - (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, @@ -1077,11 +1002,11 @@ def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), (TCRETURNdi64 texternalsym:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; // Normal calls, with various flavors of addresses. def : Pat<(X86call (i32 tglobaladdr:$dst)), @@ -1556,8 +1481,12 @@ def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; // Helper imms that check if a mask doesn't change significant shift bits. -def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>; -def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>; +def immShift32 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 5; +}]>; +def immShift64 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 6; +}]>; // Shift amount is implicitly masked. multiclass MaskedShiftAmountPats<SDNode frag, string name> { @@ -1724,35 +1653,18 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// Increment reg. -// Do not make INC if it is slow -def : Pat<(add GR8:$src, 1), - (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>; -def : Pat<(add GR16:$src, 1), - (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR16:$src, 1), - (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR32:$src, 1), - (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR32:$src, 1), - (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR64:$src, 1), - (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>; - -// Decrement reg. -// Do not make DEC if it is slow -def : Pat<(add GR8:$src, -1), - (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>; -def : Pat<(add GR16:$src, -1), - (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR16:$src, -1), - (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR32:$src, -1), - (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR32:$src, -1), - (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR64:$src, -1), - (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>; +// Increment/Decrement reg. 
+// Do not make INC/DEC if it is slow +let Predicates = [NotSlowIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} // or reg/reg. def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 39ad395..6ab961f 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -57,33 +57,32 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, // Unconditional branches. let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { - def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize32; - def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize16, - Requires<[In16BitMode]>; - let hasSideEffects = 0 in def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), - "jmp\t$dst", [], IIC_JMP_REL>; + "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize16; + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize32; + } } // Conditional Branches. let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { - let hasSideEffects = 0 in - def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [], - IIC_Jcc>; - def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, OpSize16, - TB, Requires<[In16BitMode]>; - def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB, - OpSize32; + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, + [], IIC_Jcc>, OpSize16, TB; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, + [], IIC_Jcc>, TB, OpSize32; + } } } defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; -defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; @@ -106,20 +105,14 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in // jecxz. 
let Uses = [CX] in def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[Not64BitMode]>; + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16; let Uses = [ECX] in - def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", [], IIC_JCXZ>, Requires<[Not64BitMode]>; + def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; - // J*CXZ instruction: 64-bit versions of this instruction for the asmparser. - // In 64-bit mode, the address size prefix is jecxz and the unprefixed version - // is jrcxz. - let Uses = [ECX] in - def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>; let Uses = [RCX] in def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>; + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64; } // Indirect branches @@ -145,14 +138,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; - def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), - (ins i16imm:$off, i16imm:$seg), - "ljmp{w}\t{$seg, $off|$off, $seg}", [], - IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; - def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), - (ins i32imm:$off, i16imm:$seg), - "ljmp{l}\t{$seg, $off|$off, $seg}", [], - IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + let Predicates = [Not64BitMode] in { + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, Sched<[WriteJump]>; @@ -186,10 +181,11 @@ let isCall = 1 in (outs), (ins i32imm_pcrel:$dst), "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; - def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, - (outs), (ins i16imm_pcrel:$dst), - "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, - Sched<[WriteJump]>; + let hasSideEffects = 0 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst), + "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, + Sched<[WriteJump]>; def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst), "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>, OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>; @@ -207,14 +203,16 @@ let isCall = 1 in Requires<[Not64BitMode,FavorMemIndirectCall]>, Sched<[WriteJumpLd]>; - def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), - (ins i16imm:$off, i16imm:$seg), - "lcall{w}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; - def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), - (ins i32imm:$off, i16imm:$seg), - "lcall{l}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + let Predicates = [Not64BitMode] in { + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } def FARCALL16m : 
I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16, @@ -242,13 +240,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // mcinst. def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), (ins i32imm_pcrel:$dst), - "jmp\t$dst # TAILCALL", + "jmp\t$dst", [], IIC_JMP_REL>; def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; } @@ -280,17 +278,6 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; } -let isCall = 1, isCodeGenOnly = 1 in - // __chkstk(MSVC): clobber R10, R11 and EFLAGS. - // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. - let Defs = [RAX, R10, R11, RSP, EFLAGS], - Uses = [RSP] in { - def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst), - "call{q}\t$dst", [], IIC_CALL_RI>, - Requires<[IsWin64]>, Sched<[WriteJump]>; - } - let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, SchedRW = [WriteJump] in { @@ -303,13 +290,25 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURNmi64 : PseudoI<(outs), (ins i64mem_TC:$dst, i32imm:$offset), []>; - def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst), - "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), + "jmp\t$dst", [], IIC_JMP_REL>; def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; let mayLoad = 1 in def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + // Win64 wants jumps leaving the function to have a REX_W prefix. 
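// Per the comment above, Win64 wants tail jumps that leave the function to carry
// a REX.W prefix, which is what the TAILJMP*_REX forms defined just below request
// via hasREX_WPrefix = 1.  A rough sketch of what that prefix adds to the
// encoding of an indirect tail jump through %rax ("jmp *%rax" is FF /4 with
// ModRM 0xE0); the emit helper is hypothetical.

#include <cstdint>
#include <vector>

static std::vector<uint8_t> encodeIndirectTailJmpRAX(bool NeedsREXW) {
  std::vector<uint8_t> Bytes;
  if (NeedsREXW)
    Bytes.push_back(0x48);      // REX prefix with only W set
  Bytes.push_back(0xFF);        // opcode: group 5
  Bytes.push_back(0xE0);        // ModRM: mod=11, reg=/4 (JMP), rm=RAX
  return Bytes;                 // {0x48, 0xFF, 0xE0} == "rex64 jmp *%rax"
}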
+ let hasREX_WPrefix = 1 in { + def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst), + "rex64 jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + } } diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index b38129a..c4b2d6d 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { let Defs = [AX], Uses = [AL] in def CBW : I<0x98, RawFrm, (outs), (ins), "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL) @@ -39,7 +39,7 @@ let neverHasSideEffects = 1 in { // Sign/Zero extenders -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, TB, OpSize16, Sched<[WriteALU]>; @@ -47,7 +47,7 @@ let mayLoad = 1 in def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, TB, OpSize16, Sched<[WriteALULd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, @@ -65,7 +65,7 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, OpSize32, TB, Sched<[WriteALULd]>; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, TB, OpSize16, Sched<[WriteALU]>; @@ -73,7 +73,7 @@ let mayLoad = 1 in def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, TB, OpSize16, Sched<[WriteALULd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, @@ -94,7 +94,7 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), // These are the same as the regular MOVZX32rr8 and MOVZX32rm8 // except that they use GR32_NOREX for the output operand register class // instead of GR32. This allows them to operate on h registers on x86-64. 
-let neverHasSideEffects = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", @@ -139,11 +139,11 @@ def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>, - Sched<[WriteALU]>; + Sched<[WriteALU]>, Requires<[In64BitMode]>; def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>, - Sched<[WriteALULd]>; + Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index c0a6864..2993e42 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -69,7 +69,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, /* IsRVariantCommutable */ 1, /* IsMVariantCommutable */ 1, Op>; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; @@ -81,7 +81,7 @@ let neverHasSideEffects = 1 in { MemFrag128, MemFrag256, OpTy128, OpTy256, /* IsRVariantCommutable */ 1, /* IsMVariantCommutable */ 0>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 } // Fused Multiply-Add @@ -155,7 +155,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, SDNode OpNode, RegisterClass RC, ValueType OpVT, X86MemOperand x86memop, Operand memop, PatFrag mem_frag, ComplexPattern mem_cpat> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC, OpVT, mem_frag>; // See the other defm of r231 for the explanation regarding the diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index d9f173e..6cd5e79 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -17,13 +17,13 @@ // FPStack specific DAG Nodes. //===----------------------------------------------------------------------===// -def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, SDTCisVT<1, f80>]>; def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, - SDTCisPtrTy<1>, + SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, - SDTCisPtrTy<1>, + SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; @@ -98,7 +98,7 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // All FP Stack operations are represented with four instructions here. The // first three instructions, generated by the instruction selector, use "RFP32" // "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, -// 64-bit or 80-bit floating point values. These sizes apply to the values, +// 64-bit or 80-bit floating point values. These sizes apply to the values, // not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be // copied to each other without losing information. 
These instructions are all // pseudo instructions and use the "_Fp" suffix. @@ -107,7 +107,7 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // The second instruction is defined with FPI, which is the actual instruction // emitted by the assembler. These use "RST" registers, although frequently // the actual register(s) used are implicit. These are always 80 bits. -// The FP stackifier pass converts one to the other after register allocation +// The FP stackifier pass converts one to the other after register allocation // occurs. // // Note that the FpI instruction should have instruction selection info (e.g. @@ -139,66 +139,66 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // These instructions cannot address 80-bit memory. multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { // ST(0) = ST(0) + [mem] -def _Fp32m : FpIf32<(outs RFP32:$dst), +def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, + [(set RFP32:$dst, (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; -def _Fp64m : FpIf64<(outs RFP64:$dst), +def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, + [(set RFP64:$dst, (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; -def _Fp64m32: FpIf64<(outs RFP64:$dst), +def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, + [(set RFP64:$dst, (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; -def _Fp80m32: FpI_<(outs RFP80:$dst), +def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, + [(set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; -def _Fp80m64: FpI_<(outs RFP80:$dst), +def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, + [(set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; -def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")> { + let mayLoad = 1; } -def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")> { + let mayLoad = 1; } // ST(0) = ST(0) + [memint] -def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, [(set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, [(set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i32)))]>; -def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, [(set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, [(set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i32)))]>; -def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, 
i16mem:$src2), OneArgFPRW, [(set RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW, [(set RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i32)))]>; -def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")> { + let mayLoad = 1; } -def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")> { + let mayLoad = 1; } } @@ -282,7 +282,7 @@ defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; defm SIN : FPUnary<fsin, MRM_FE, "fsin">; defm COS : FPUnary<fcos, MRM_FF, "fcos">; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; @@ -415,7 +415,7 @@ def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, [(truncstoref64 RFP80:$src, addr:$op)]>; // FST does not support 80-bit memory target; FSTP must be used. -let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; @@ -424,7 +424,7 @@ def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; } def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, [(store RFP80:$src, addr:$op)]>; -let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; @@ -500,7 +500,7 @@ def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", IIC_FST>; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", IIC_FST>; -def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst", IIC_FST>; } @@ -636,12 +636,12 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), "fxsave\t$dst", [], IIC_FXSAVE>, TB; def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, + "fxsave64\t$dst", [], IIC_FXSAVE>, TB, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor\t$src", [], IIC_FXRSTOR>, TB; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor{q|64}\t$src", [], IIC_FXRSTOR>, TB, + "fxrstor64\t$src", [], IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; } // SchedRW @@ -656,12 +656,12 @@ def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. 
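// The X86fst patterns just below pick the store instruction from the *value*
// width (f32/f64/f80) even though, as the comments earlier in this file note,
// the RFP/x87 registers always hold 80 bits.  A small standalone illustration of
// that value-vs-register distinction, assuming an x86 target where long double
// is the 80-bit x87 format:

#include <cstdio>

static void storeAtDifferentWidths(long double X) {
  float  F = static_cast<float>(X);   // analogous to an f32 store: rounds to 32 bits
  double D = static_cast<double>(X);  // analogous to an f64 store: rounds to 64 bits
  // X itself keeps full extended precision until it is stored as f80.
  std::printf("%.20Lg %.20g %.20g\n", X, D, static_cast<double>(F));
}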
def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, +def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>; def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, +def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, +def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, RFP80:$src)>; def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, RFP80:$src)>; diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index fe4ead1..56043fb 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -34,23 +34,27 @@ def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; def MRM6m : Format<30>; def MRM7m : Format<31>; def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; -def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>; -def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>; -def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>; -def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>; -def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>; -def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>; -def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>; -def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>; -def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>; -def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>; -def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>; -def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>; -def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>; -def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>; -def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>; -def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>; -def MRM_FE : Format<83>; def MRM_FF : Format<84>; +def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>; +def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>; +def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>; +def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>; +def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>; +def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>; +def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>; +def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>; +def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>; +def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>; +def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>; +def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>; +def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>; +def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>; +def MRM_ED : Format<77>; def MRM_EE : 
Format<78>; def MRM_EF : Format<79>; +def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>; +def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>; +def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>; +def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>; +def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>; +def MRM_FF : Format<95>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -146,11 +150,22 @@ def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. +// Address size for encodings that change based on mode. +class AddressSize<bits<2> val> { + bits<2> Value = val; +} +def AdSizeX : AddressSize<0>; // Address size determined using addr operand. +def AdSize16 : AddressSize<1>; // Encodes a 16-bit address. +def AdSize32 : AddressSize<2>; // Encodes a 32-bit address. +def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize16 { OperandSize OpSize = OpSize16; } class OpSize32 { OperandSize OpSize = OpSize32; } -class AdSize { bit hasAdSizePrefix = 1; } +class AdSize16 { AddressSize AdSize = AdSize16; } +class AdSize32 { AddressSize AdSize = AdSize32; } +class AdSize64 { AddressSize AdSize = AdSize64; } class REX_W { bit hasREX_WPrefix = 1; } class LOCK { bit hasLockPrefix = 1; } class REP { bit hasREPPrefix = 1; } @@ -231,9 +246,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // AsmString from the parser, but still disassemble. OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change - // based on operand size of the mode + // based on operand size of the mode? bits<2> OpSizeBits = OpSize.Value; - bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? + AddressSize AdSize = AdSizeX; // Does this instruction's encoding change + // based on address size of the mode? + bits<2> AdSizeBits = AdSize.Value; Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? bits<3> OpPrefixBits = OpPrefix.Value; @@ -284,35 +301,35 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); - // TSFlags layout should be kept in sync with X86InstrInfo.h. + // TSFlags layout should be kept in sync with X86BaseInfo.h. 
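// With AdSize widened to a two-bit field, the layout assigned below packs the
// address-size kind into TSFlags bits 10-9, right after the two OpSize bits in
// 8-7.  A rough standalone sketch of how an emitter could consume that field to
// decide on the 0x67 address-size prefix; the enum values mirror the AdSize16/
// AdSize32/AdSize64 comments above, and the helper itself is hypothetical.

#include <cstdint>

enum AddressSizeKind { kAdSizeX = 0, kAdSize16 = 1, kAdSize32 = 2, kAdSize64 = 3 };

static unsigned adSizeBits(uint64_t TSFlags) { return (TSFlags >> 9) & 0x3; }

// ModeDefault is the default address size of the current mode (16, 32 or 64).
static bool needsAdSizePrefix(uint64_t TSFlags, unsigned ModeDefault) {
  unsigned AdSize = adSizeBits(TSFlags);
  if (AdSize == kAdSizeX)
    return false;                          // the address operand decides; no fixed size
  unsigned Encoded = 16u << (AdSize - 1);  // kAdSize16->16, kAdSize32->32, kAdSize64->64
  return Encoded != ModeDefault;           // 0x67 switches away from the mode default
}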
let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; - let TSFlags{9} = hasAdSizePrefix; - let TSFlags{12-10} = OpPrefixBits; - let TSFlags{15-13} = OpMapBits; - let TSFlags{16} = hasREX_WPrefix; - let TSFlags{20-17} = ImmT.Value; - let TSFlags{23-21} = FPForm.Value; - let TSFlags{24} = hasLockPrefix; - let TSFlags{25} = hasREPPrefix; - let TSFlags{27-26} = ExeDomain.Value; - let TSFlags{29-28} = OpEncBits; - let TSFlags{37-30} = Opcode; - let TSFlags{38} = hasVEX_WPrefix; - let TSFlags{39} = hasVEX_4V; - let TSFlags{40} = hasVEX_4VOp3; - let TSFlags{41} = hasVEX_i8ImmReg; - let TSFlags{42} = hasVEX_L; - let TSFlags{43} = ignoresVEX_L; - let TSFlags{44} = hasEVEX_K; - let TSFlags{45} = hasEVEX_Z; - let TSFlags{46} = hasEVEX_L2; - let TSFlags{47} = hasEVEX_B; + let TSFlags{10-9} = AdSizeBits; + let TSFlags{13-11} = OpPrefixBits; + let TSFlags{16-14} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; + let TSFlags{39} = hasVEX_WPrefix; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_4VOp3; + let TSFlags{42} = hasVEX_i8ImmReg; + let TSFlags{43} = hasVEX_L; + let TSFlags{44} = ignoresVEX_L; + let TSFlags{45} = hasEVEX_K; + let TSFlags{46} = hasEVEX_Z; + let TSFlags{47} = hasEVEX_L2; + let TSFlags{48} = hasEVEX_B; // If we run out of TSFlags bits, it's possible to encode this in 3 bits. - let TSFlags{54-48} = CD8_Scale; - let TSFlags{55} = has3DNow0F0FOpcode; - let TSFlags{56} = hasMemOp4Prefix; - let TSFlags{57} = hasEVEX_RC; + let TSFlags{55-49} = CD8_Scale; + let TSFlags{56} = has3DNow0F0FOpcode; + let TSFlags{57} = hasMemOp4Prefix; + let TSFlags{58} = hasEVEX_RC; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -327,26 +344,26 @@ class I<bits<8> o, Format f, dag outs, dag ins, string asm, let Pattern = pattern; let CodeSize = 3; } -class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary, Domain d = GenericDomain> : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } -class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; @@ -359,14 +376,14 @@ class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } -class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class 
Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { let Pattern = pattern; @@ -393,14 +410,14 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, // Iseg16 - 16-bit segment selector, 16-bit offset // Iseg32 - 16-bit segment selector, 32-bit offset -class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; @@ -409,8 +426,9 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = NoItinerary> - : I<o, F, outs, ins, asm, pattern, itin> { + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], @@ -478,7 +496,7 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, } // SSE1 Instruction Templates: -// +// // SSI - SSE1 instructions with XS prefix. // PSI - SSE1 instructions with PS prefix. // PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix. @@ -509,7 +527,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; // SSE2 Instruction Templates: -// +// // SDI - SSE2 instructions with XD prefix. // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. // S2SI - SSE2 instructions with XS prefix. @@ -573,16 +591,16 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; // SSE3 Instruction Templates: -// +// // S3I - SSE3 instructions with PD prefixes. // S3SI - SSE3 instructions with XS prefix. // S3DI - SSE3 instructions with XD prefix. -class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, Requires<[UseSSE3]>; -class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, Requires<[UseSSE3]>; @@ -593,7 +611,7 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, // SSSE3 Instruction Templates: -// +// // SS38I - SSSE3 instructions with T8 prefix. // SS3AI - SSSE3 instructions with TA prefix. // MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. @@ -621,7 +639,7 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: -// +// // SS48I - SSE 4.1 instructions with T8 prefix. // SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. 
// @@ -635,7 +653,7 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[UseSSE41]>; // SSE4.2 Instruction Templates: -// +// // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> @@ -699,6 +717,9 @@ class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, Requires<[HasAVX512]>; +class AVX5128IBase : T8PD { + Domain ExeDomain = SSEPackedInt; +} class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, @@ -868,27 +889,27 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, // MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. // MMXID - MMX instructions with XD prefix. // MMXIS - MMX instructions with XS prefix. -class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; -class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>; -class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>; -class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>; -class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>; -class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; -class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; -class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 1c7215c..bf515a8 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -12,10 +12,23 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// MMX specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +// GPR to low word of MMX. +def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>; + +//===----------------------------------------------------------------------===// // MMX Pattern Fragments //===----------------------------------------------------------------------===// def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def load_mvmmx : PatFrag<(ops node:$ptr), + (x86mmx (MMX_X86movw2d (load node:$ptr)))>; def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; //===----------------------------------------------------------------------===// @@ -201,10 +214,19 @@ def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; + def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<2>]>; +def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<3>]>; +def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; @@ -256,6 +278,11 @@ def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; +def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; + def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; @@ -263,9 +290,22 @@ def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; + def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; -def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; +def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; + +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; +def X86mgather : SDNode<"X86ISD::GATHER", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -278,6 +318,13 @@ 
def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -334,6 +381,15 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; +// These are needed to match a scalar load that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def loadf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; +def loadf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ @@ -412,20 +468,10 @@ def alignedloadv8i64 : PatFrag<(ops node:$ptr), // setting a feature bit in the processor (on startup, for example). // Opteron 10h and later implement such a feature. def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() + return Subtarget->hasSSEUnalignedMem() || cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; -def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() - || cast<LoadSDNode>(N)->getAlignment() >= 4; -}]>; - -def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() - || cast<LoadSDNode>(N)->getAlignment() >= 8; -}]>; - def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; @@ -435,17 +481,15 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// 256-bit memop pattern fragments -// NOTE: all 256-bit integer vector loads are promoted to v4i64 -def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; -def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; -def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +// These are needed to match a scalar memop that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. 
+def memopfsf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; +def memopfsf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; -// 512-bit memop pattern fragments -def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>; -def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>; -def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>; -def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. @@ -482,6 +526,58 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; +def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + // return (Mgt->getIndex().getValueType() == MVT::v8i32 || + // Mgt->getBasePtr().getValueType() == MVT::v8i32); + //return false; + return N != 0; +}]>; + +def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + // return (Mgt->getIndex().getValueType() == MVT::v8i64 || + // Mgt->getBasePtr().getValueType() == MVT::v8i64); + //return false; + return N != 0; +}]>; +def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + // return (Mgt->getIndex().getValueType() == MVT::v16i32 || + // Mgt->getBasePtr().getValueType() == MVT::v16i32); + //return false; + return N != 0; +}]>; + +def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + // return (Sc->getIndex().getValueType() == MVT::v8i32 || + // Sc->getBasePtr().getValueType() == MVT::v8i32); + //return false; + return N != 0; +}]>; + +def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + // return (Sc->getIndex().getValueType() == MVT::v8i64 || + // Sc->getBasePtr().getValueType() == MVT::v8i64); + //return false; + return N != 0; +}]>; +def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + //if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + // return (Sc->getIndex().getValueType() == MVT::v16i32 || + // Sc->getBasePtr().getValueType() == MVT::v16i32); + //return false; + return N != 0; +}]>; + // 128-bit bitconvert pattern fragments def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7f87bdd..f5b9680 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -65,6 +65,7 @@ enum { TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, + TB_INDEX_4 = 4, TB_INDEX_MASK = 0xf, // Do not insert the reverse map (MemOp -> RegOp) into the table. 
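// A rough standalone sketch of the folding-table scheme these TB_* flags support:
// each entry maps a register-form opcode to its memory-form twin, and a reverse
// (unfold) map is built as well unless TB_NO_REVERSE is set, as it is for the
// VBROADCASTSS entries later in this file.  The map types, flag value and helper
// name here are illustrative stand-ins, simplified from the real tables.

#include <cstdint>
#include <unordered_map>

struct FoldEntry { uint16_t Opcode; uint16_t Flags; };

static const uint16_t TB_NO_REVERSE_BIT = 1u << 5; // stand-in for TB_NO_REVERSE

static void addFoldEntry(std::unordered_map<uint16_t, FoldEntry> &RegToMem,
                         std::unordered_map<uint16_t, FoldEntry> &MemToReg,
                         uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
  RegToMem[RegOp] = {MemOp, Flags};      // fold: register form -> memory form
  if (!(Flags & TB_NO_REVERSE_BIT))
    MemToReg[MemOp] = {RegOp, Flags};    // unfold: memory form -> register form
}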
@@ -90,7 +91,7 @@ enum { TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; -struct X86OpTblEntry { +struct X86MemoryFoldTableEntry { uint16_t RegOp; uint16_t MemOp; uint16_t Flags; @@ -105,7 +106,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), Subtarget(STI), RI(STI) { - static const X86OpTblEntry OpTbl2Addr[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, @@ -145,14 +146,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::AND8rr, X86::AND8mr, 0 }, { X86::DEC16r, X86::DEC16m, 0 }, { X86::DEC32r, X86::DEC32m, 0 }, - { X86::DEC64_16r, X86::DEC64_16m, 0 }, - { X86::DEC64_32r, X86::DEC64_32m, 0 }, { X86::DEC64r, X86::DEC64m, 0 }, { X86::DEC8r, X86::DEC8m, 0 }, { X86::INC16r, X86::INC16m, 0 }, { X86::INC32r, X86::INC32m, 0 }, - { X86::INC64_16r, X86::INC64_16m, 0 }, - { X86::INC64_32r, X86::INC64_32m, 0 }, { X86::INC64r, X86::INC64m, 0 }, { X86::INC8r, X86::INC8m, 0 }, { X86::NEG16r, X86::NEG16m, 0 }, @@ -272,17 +269,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8mr, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { - unsigned RegOp = OpTbl2Addr[i].RegOp; - unsigned MemOp = OpTbl2Addr[i].MemOp; - unsigned Flags = OpTbl2Addr[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) { + unsigned RegOp = MemoryFoldTable2Addr[i].RegOp; + unsigned MemOp = MemoryFoldTable2Addr[i].MemOp; + unsigned Flags = MemoryFoldTable2Addr[i].Flags; AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, RegOp, MemOp, // Index 0, folded load and store, no alignment requirement. Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const X86OpTblEntry OpTbl0[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, @@ -336,6 +333,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, + { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -354,10 +353,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + // AVX 128-bit versions of foldable instructions { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -370,6 +371,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, + { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, + // AVX 256-bit 
foldable instructions { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -377,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, @@ -389,6 +394,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -400,6 +406,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -410,18 +417,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, + + // F16C foldable instructions + { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, + { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } }; - for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { - unsigned RegOp = OpTbl0[i].RegOp; - unsigned MemOp = OpTbl0[i].MemOp; - unsigned Flags = OpTbl0[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) { + unsigned RegOp = MemoryFoldTable0[i].RegOp; + unsigned MemOp = MemoryFoldTable0[i].MemOp; + unsigned Flags = MemoryFoldTable0[i].Flags; AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, RegOp, MemOp, TB_INDEX_0 | Flags); } - static const X86OpTblEntry OpTbl1[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::CMP16rr, X86::CMP16rm, 0 }, { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, @@ -448,9 +459,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, @@ -490,11 +504,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, { X86::PABSDrr128, X86::PABSDrm128, 
TB_ALIGN_16 }, { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, + { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, + { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, + { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, + { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 }, + { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, + { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, + { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, @@ -512,6 +546,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, + { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, + { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, + { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, + { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, + { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, + { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, + { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 }, + { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, + { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, + // AVX 128-bit versions of foldable instructions { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, @@ -529,9 +576,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, @@ -542,8 +592,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, - { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, - { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, @@ -551,50 
+601,151 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, + { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, + { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, + { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, + { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VPTESTrr, X86::VPTESTrm, 0 }, { X86::VRCPPSr, X86::VRCPPSm, 0 }, { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, + { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, + { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, + { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, - { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable instructions + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, + { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, + { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, + { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, 0 }, { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, + { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, // AVX2 foldable instructions + + // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the + // VBROADCASTS{SD}rm memory instructions were available from AVX1. + // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction + // on AVX1 targets. 
The VPBROADCAST instructions are all AVX2 instructions + // so they don't need an equivalent limitation. + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, + { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, + { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, + { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, + { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + // XOP foldable instructions + { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, + { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, + { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, + { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, + { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, + { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, + { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, + { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, + { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, + { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, + { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, + { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, + { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, + { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, + { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, + { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, + { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, + { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, + { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, + { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, + { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, + { X86::VPROTBri, X86::VPROTBmi, 0 }, + { X86::VPROTBrr, X86::VPROTBmr, 0 }, + { X86::VPROTDri, X86::VPROTDmi, 0 }, + { X86::VPROTDrr, X86::VPROTDmr, 0 }, + { X86::VPROTQri, X86::VPROTQmi, 0 }, + { X86::VPROTQrr, X86::VPROTQmr, 0 }, + { X86::VPROTWri, X86::VPROTWmi, 0 }, + { X86::VPROTWrr, X86::VPROTWmr, 0 }, + { X86::VPSHABrr, X86::VPSHABmr, 0 }, + { X86::VPSHADrr, X86::VPSHADmr, 0 }, + { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, + { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, + { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, + { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, + { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, + { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, + // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, @@ -659,6 +810,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, 
TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, @@ -670,6 +824,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, @@ -681,25 +838,30 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + + // F16C foldable instructions + { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, + { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, - { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 } + { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { - unsigned RegOp = OpTbl1[i].RegOp; - unsigned MemOp = OpTbl1[i].MemOp; - unsigned Flags = OpTbl1[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) { + unsigned RegOp = MemoryFoldTable1[i].RegOp; + unsigned MemOp = MemoryFoldTable1[i].MemOp; + unsigned Flags = MemoryFoldTable1[i].Flags; AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, RegOp, MemOp, // Index 1, folded load Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const X86OpTblEntry OpTbl2[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, @@ -712,7 +874,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, @@ -782,7 +946,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, + { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, + + // FIXME: We should not be folding Fs* scalar loads into vector + // instructions because the vector instructions require vector-sized + // loads. Lowering should create vector-sized instructions (the Fv* + // variants below) to allow load folding. 
{ X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, @@ -791,6 +964,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + + { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, + { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, + { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, + { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, + { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, + { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, + { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, + { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, @@ -809,16 +991,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, @@ -842,7 +1030,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, + { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, @@ -857,7 +1047,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, - { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, + { X86::PINSRBrr, X86::PINSRBrm, 0 }, + { X86::PINSRDrr, X86::PINSRDrm, 0 }, + { X86::PINSRQrr, X86::PINSRQrm, 0 }, + { X86::PINSRWrri, X86::PINSRWrmi, 0 }, { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, @@ -895,8 +1088,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, + { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, 
TB_ALIGN_16 }, @@ -918,7 +1114,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, // FIXME: TEST*rr -> swapped operand of TEST*mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, @@ -930,6 +1128,79 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8rm, 0 }, { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, + { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, + { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, + { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, + { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, + { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, + { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, + { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, + { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, + { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, + { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, + { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, + { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, + { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, + { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, + { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, + { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, + { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, + { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, + { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, + { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, + { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, + { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, + { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, + { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, + { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, + { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, + { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, + { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, + { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, + { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, + { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, + { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, + { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, + { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, + { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, + { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, + { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, + { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, + { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, + { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, + { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, + { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, + { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, + { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, + { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, + { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, + { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, + { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, + { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, + { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, + { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, + { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, + { 
X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, + { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, + { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, + { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, + { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, + { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, + { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, + { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, + { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, + { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, + { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, + { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, + { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + // AVX 128-bit versions of foldable instructions { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, @@ -943,13 +1214,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VRCPSSr, X86::VRCPSSm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, @@ -967,15 +1241,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, - { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, - { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, - { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 }, - { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 }, - { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, - { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, - { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDPPDrri, X86::VDPPDrmi, 0 }, + { X86::VDPPSrri, X86::VDPPSrmi, 0 }, + // Do not fold VFs* loads because there are no scalar load variants for + // these instructions. When folded, the load is required to be 128-bits, so + // the load size would not match. 
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, + { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, + { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, + { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, + { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, + { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, + { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, + { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, @@ -985,16 +1266,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, @@ -1014,7 +1301,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDrr, X86::VPANDrm, 0 }, { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, @@ -1031,6 +1320,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, + { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, + { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, @@ -1069,8 +1361,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, + { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, @@ -1086,13 +1381,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, { X86::VXORPDrr, X86::VXORPDrm, 0 }, { X86::VXORPSrr, X86::VXORPSrm, 0 }, + // AVX 256-bit foldable instructions { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, @@ -1110,6 +1408,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { 
X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, @@ -1136,6 +1435,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, + // AVX2 foldable instructions { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, @@ -1157,6 +1457,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, @@ -1168,9 +1469,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, - { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, - { X86::VPERMQYri, X86::VPERMQYmi, 0 }, { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, @@ -1225,8 +1524,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, + { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, @@ -1237,41 +1539,81 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, { X86::VPXORYrr, X86::VPXORYrm, 0 }, - // FIXME: add AVX 256-bit foldable instructions // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFMADDSUBPS4rr, 
X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 }, + { X86::VPCOMBri, X86::VPCOMBmi, 0 }, + { X86::VPCOMDri, X86::VPCOMDmi, 0 }, + { X86::VPCOMQri, X86::VPCOMQmi, 0 }, + { X86::VPCOMWri, X86::VPCOMWmi, 0 }, + { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, + { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, + { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, + { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 }, + { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, + { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, + { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, + { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, + { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, + { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, + { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, + { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, + { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, + { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, + { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, + { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, + { X86::VPPERMrr, X86::VPPERMmr, 0 }, + { X86::VPROTBrr, X86::VPROTBrm, 0 }, + { X86::VPROTDrr, X86::VPROTDrm, 0 }, + { X86::VPROTQrr, X86::VPROTQrm, 0 }, + { X86::VPROTWrr, X86::VPROTWrm, 0 }, + { X86::VPSHABrr, X86::VPSHABrm, 0 }, + { X86::VPSHADrr, X86::VPSHADrm, 0 }, + { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, + { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, + { X86::VPSHLBrr, 
X86::VPSHLBrm, 0 }, + { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, + { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, + { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, // BMI/BMI2 foldable instructions { X86::ANDN32rr, X86::ANDN32rm, 0 }, @@ -1321,16 +1663,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VALIGNQrri, X86::VALIGNQrmi, 0 }, { X86::VALIGNDrri, X86::VALIGNDrmi, 0 }, { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, - { X86::VAESDECLASTrr, X86::VAESDECLASTrm, TB_ALIGN_16 }, - { X86::VAESDECrr, X86::VAESDECrm, TB_ALIGN_16 }, - { X86::VAESENCLASTrr, X86::VAESENCLASTrm, TB_ALIGN_16 }, - { X86::VAESENCrr, X86::VAESENCrm, TB_ALIGN_16 }, + { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, + { X86::VAESDECrr, X86::VAESDECrm, 0 }, + { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, + { X86::VAESENCrr, X86::VAESENCrm, 0 }, // SHA foldable instructions { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, @@ -1339,20 +1694,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, - { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }, + { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { - unsigned RegOp = OpTbl2[i].RegOp; - unsigned MemOp = OpTbl2[i].MemOp; - unsigned Flags = OpTbl2[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) { + unsigned RegOp = MemoryFoldTable2[i].RegOp; + unsigned MemOp = MemoryFoldTable2[i].MemOp; + unsigned Flags = MemoryFoldTable2[i].Flags; AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, RegOp, MemOp, // Index 2, folded load Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } - static const X86OpTblEntry OpTbl3[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, @@ -1493,6 +1848,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, + { X86::VPPERMrr, X86::VPPERMrm, 0 }, + // AVX-512 VPERMI instructions with 3 source operands. 
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, @@ -1501,19 +1866,114 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, - { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 } + { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, + { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, + // AVX-512 arithmetic instructions + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 256-bit + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 128-bit + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { - unsigned RegOp = OpTbl3[i].RegOp; - unsigned MemOp = OpTbl3[i].MemOp; - unsigned Flags = OpTbl3[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) { + unsigned RegOp = MemoryFoldTable3[i].RegOp; + unsigned MemOp = MemoryFoldTable3[i].MemOp; + unsigned Flags = MemoryFoldTable3[i].Flags; AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, RegOp, MemOp, // Index 3, folded load Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } + static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { + // AVX-512 foldable instructions + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { 
X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + // AVX-512{F,VL} foldable instructions 256-bit + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + }; + + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) { + unsigned RegOp = MemoryFoldTable4[i].RegOp; + unsigned MemOp = MemoryFoldTable4[i].MemOp; + unsigned Flags = MemoryFoldTable4[i].Flags; + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + RegOp, MemOp, + // Index 4, folded load + Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + } } void @@ -1579,7 +2039,59 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, return false; } -/// isFrameOperand - Return true and the FrameIndex if the specified +int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + + if (MI->getOpcode() == getCallFrameSetupOpcode() || + MI->getOpcode() == getCallFrameDestroyOpcode()) { + unsigned StackAlign = TFI->getStackAlignment(); + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * + StackAlign; + + SPAdj -= MI->getOperand(1).getImm(); + + if (MI->getOpcode() == getCallFrameSetupOpcode()) + return SPAdj; + else + return -SPAdj; + } + + // To know whether a call adjusts the stack, we need information + // that is bound to the following ADJCALLSTACKUP pseudo. + // Look for the next ADJCALLSTACKUP that follows the call. + if (MI->isCall()) { + const MachineBasicBlock* MBB = MI->getParent(); + auto I = ++MachineBasicBlock::const_iterator(MI); + for (auto E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == getCallFrameDestroyOpcode() || + I->isCall()) + break; + } + + // If we could not find a frame destroy opcode, then it has already + // been simplified, so we don't care. 
+ if (I->getOpcode() != getCallFrameDestroyOpcode()) + return 0; + + return -(I->getOperand(1).getImm()); + } + + // Currently we only handle PUSHes we can reasonably expect to see + // in call sequences. + switch (MI->getOpcode()) { + default: + return 0; + case X86::PUSH32i8: + case X86::PUSH32r: + case X86::PUSH32rmm: + case X86::PUSH32rmr: + case X86::PUSHi32: + return 4; + } +} + +/// Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const { @@ -1706,8 +2218,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, return 0; } -/// regIsPICBase - Return true if register is PIC base (i.e.g defined by -/// X86::MOVPC32r. +/// Return true if register is PIC base; i.e. defined by X86::MOVPC32r. static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) @@ -1903,8 +2414,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); } -/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that -/// is not marked dead. +/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. static bool hasLiveCondCodeDef(MachineInstr *MI) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); @@ -1916,8 +2426,7 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) { return false; } -/// getTruncatedShiftCount - check whether the shift count for a machine operand -/// is non-zero. +/// Check whether the shift count for a machine operand is non-zero. inline static unsigned getTruncatedShiftCount(MachineInstr *MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. @@ -1926,7 +2435,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, return Imm & ShiftCountMask; } -/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate +/// Check whether the given shift count is appropriate, i.e. whether it /// can be represented by a LEA instruction. inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address @@ -2008,10 +2517,9 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, return true; } -/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when -/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting -/// to a 32-bit superregister and then truncating back down to a 16-bit -/// subregister. +/// Helper for convertToThreeAddress when 16-bit LEA is disabled; it uses 32-bit +/// LEA to form 3-address code by promoting to a 32-bit superregister and then +/// truncating back down to a 16-bit subregister.
MachineInstr * X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, @@ -2058,11 +2566,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, break; } case X86::INC16r: - case X86::INC64_16r: addRegOffset(MIB, leaInReg, true, 1); break; case X86::DEC16r: - case X86::DEC64_16r: addRegOffset(MIB, leaInReg, true, -1); break; case X86::ADD16ri: @@ -2120,7 +2626,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, return ExtMI; } -/// convertToThreeAddress - This method must be implemented by targets that +/// This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target /// may be able to convert a two-address instruction into a true /// three-address instruction on demand. This allows the X86 target (for @@ -2156,6 +2662,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { + default: return nullptr; case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); @@ -2210,185 +2717,175 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } - default: { + case X86::INC64r: + case X86::INC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - switch (MIOpc) { - default: return nullptr; - case X86::INC64r: - case X86::INC32r: - case X86::INC64_32r: { - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r - : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + NewMI = addOffset(MIB, 1); + break; + } + case X86::INC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); + break; + case X86::DEC64r: + case X86::DEC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - NewMI = addOffset(MIB, 1); - break; - } - case X86::INC16r: - case X86::INC64_16r: - if (DisableLEA16) - return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), 1); - break; - case X86::DEC64r: - case X86::DEC32r: - case X86::DEC64_32r: { - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r - : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); - NewMI = addOffset(MIB, -1); + NewMI = addOffset(MIB, -1); - break; - } - case X86::DEC16r: - case X86::DEC64_16r: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), -1); - break; - case X86::ADD64rr: - case X86::ADD64rr_DB: - case X86::ADD32rr: - case X86::ADD32rr_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc; - if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) - Opc = X86::LEA64r; - else - Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + break; + } + case X86::DEC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) + Opc = X86::LEA64r; + else + Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - const MachineOperand &Src2 = MI->getOperand(2); - bool isKill2, isUndef2; - unsigned SrcReg2; - MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) - return nullptr; + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) + return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); - if (ImplicitOp2.getReg() != 0) - MIB.addOperand(ImplicitOp2); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + MIB.addOperand(ImplicitOp2); - NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); - // Preserve undefness of the operands. - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); + // Preserve undefness of the operands. + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); - if (LV && Src2.isKill()) - LV->replaceKillInstruction(SrcReg2, MI, NewMI); - break; - } - case X86::ADD16rr: - case X86::ADD16rr_DB: { - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); - NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); - - // Preserve undefness of the operands. - bool isUndef = MI->getOperand(1).isUndef(); - bool isUndef2 = MI->getOperand(2).isUndef(); - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); - - if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, NewMI); - break; - } - case X86::ADD64ri32: - case X86::ADD64ri8: - case X86::ADD64ri32_DB: - case X86::ADD64ri8_DB: - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); - break; - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD32ri_DB: - case X86::ADD32ri8_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - NewMI = addOffset(MIB, MI->getOperand(2).getImm()); - break; - } - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri_DB: - case X86::ADD16ri8_DB: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); - break; - } + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); + break; } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; } if (!NewMI) return nullptr; @@ -2404,8 +2901,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// commuteInstruction - We have a few instructions that must be hacked on to -/// commute them. +/// We have a few instructions that must be hacked on to commute them. 
/// MachineInstr * X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { @@ -2473,6 +2969,71 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI->getOperand(3).setImm(Mask ^ Imm); return TargetInstrInfo::commuteInstruction(MI, NewMI); } + case X86::PCLMULQDQrr: + case X86::VPCLMULQDQrr:{ + // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] + // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] + unsigned Imm = MI->getOperand(3).getImm(); + unsigned Src1Hi = Imm & 0x01; + unsigned Src2Hi = Imm & 0x10; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + return TargetInstrInfo::commuteInstruction(MI, NewMI); + default: + return nullptr; + } + } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -2557,20 +3118,26 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { - case X86::BLENDPDrri: - case X86::BLENDPSrri: - case X86::PBLENDWrri: - case X86::VBLENDPDrri: - case X86::VBLENDPSrri: - case X86::VBLENDPDYrri: - case X86::VBLENDPSYrri: - case X86::VPBLENDDrri: - case X86::VPBLENDDYrri: - case X86::VPBLENDWrri: - case X86::VPBLENDWYrri: - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + SrcOpIdx1 = 1; + SrcOpIdx2 = 2; + return true; + } + return false; + } case X86::VFMADDPDr231r: case X86::VFMADDPSr231r: case X86::VFMADDSDr231r: @@ -2606,26 +3173,26 @@ bool 
X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; - case X86::JE_4: return X86::COND_E; - case X86::JNE_4: return X86::COND_NE; - case X86::JL_4: return X86::COND_L; - case X86::JLE_4: return X86::COND_LE; - case X86::JG_4: return X86::COND_G; - case X86::JGE_4: return X86::COND_GE; - case X86::JB_4: return X86::COND_B; - case X86::JBE_4: return X86::COND_BE; - case X86::JA_4: return X86::COND_A; - case X86::JAE_4: return X86::COND_AE; - case X86::JS_4: return X86::COND_S; - case X86::JNS_4: return X86::COND_NS; - case X86::JP_4: return X86::COND_P; - case X86::JNP_4: return X86::COND_NP; - case X86::JO_4: return X86::COND_O; - case X86::JNO_4: return X86::COND_NO; + case X86::JE_1: return X86::COND_E; + case X86::JNE_1: return X86::COND_NE; + case X86::JL_1: return X86::COND_L; + case X86::JLE_1: return X86::COND_LE; + case X86::JG_1: return X86::COND_G; + case X86::JGE_1: return X86::COND_GE; + case X86::JB_1: return X86::COND_B; + case X86::JBE_1: return X86::COND_BE; + case X86::JA_1: return X86::COND_A; + case X86::JAE_1: return X86::COND_AE; + case X86::JS_1: return X86::COND_S; + case X86::JNS_1: return X86::COND_NS; + case X86::JP_1: return X86::COND_P; + case X86::JNP_1: return X86::COND_NP; + case X86::JO_1: return X86::COND_O; + case X86::JNO_1: return X86::COND_NO; } } -/// getCondFromSETOpc - return condition code of a SET opcode. +/// Return condition code of a SET opcode. static X86::CondCode getCondFromSETOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; @@ -2648,7 +3215,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) { } } -/// getCondFromCmovOpc - return condition code of a CMov opcode. +/// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; @@ -2706,26 +3273,26 @@ X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { switch (CC) { default: llvm_unreachable("Illegal condition code!"); - case X86::COND_E: return X86::JE_4; - case X86::COND_NE: return X86::JNE_4; - case X86::COND_L: return X86::JL_4; - case X86::COND_LE: return X86::JLE_4; - case X86::COND_G: return X86::JG_4; - case X86::COND_GE: return X86::JGE_4; - case X86::COND_B: return X86::JB_4; - case X86::COND_BE: return X86::JBE_4; - case X86::COND_A: return X86::JA_4; - case X86::COND_AE: return X86::JAE_4; - case X86::COND_S: return X86::JS_4; - case X86::COND_NS: return X86::JNS_4; - case X86::COND_P: return X86::JP_4; - case X86::COND_NP: return X86::JNP_4; - case X86::COND_O: return X86::JO_4; - case X86::COND_NO: return X86::JNO_4; + case X86::COND_E: return X86::JE_1; + case X86::COND_NE: return X86::JNE_1; + case X86::COND_L: return X86::JL_1; + case X86::COND_LE: return X86::JLE_1; + case X86::COND_G: return X86::JG_1; + case X86::COND_GE: return X86::JGE_1; + case X86::COND_B: return X86::JB_1; + case X86::COND_BE: return X86::JBE_1; + case X86::COND_A: return X86::JA_1; + case X86::COND_AE: return X86::JAE_1; + case X86::COND_S: return X86::JS_1; + case X86::COND_NS: return X86::JNS_1; + case X86::COND_P: return X86::JP_1; + case X86::COND_NP: return X86::JNP_1; + case X86::COND_O: return X86::JO_1; + case X86::COND_NO: return X86::JNO_1; } } -/// GetOppositeBranchCondition - Return the inverse of the specified condition, +/// Return the inverse of the specified condition, /// e.g. 
turning COND_E to COND_NE. X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { switch (CC) { @@ -2749,9 +3316,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { } } -/// getSwappedCondition - assume the flags are set by MI(a,b), return -/// the condition code if we modify the instructions such that flags are -/// set by MI(b,a). +/// Assuming the flags are set by MI(a,b), return the condition code if we +/// modify the instructions such that flags are set by MI(b,a). static X86::CondCode getSwappedCondition(X86::CondCode CC) { switch (CC) { default: return X86::COND_INVALID; @@ -2768,7 +3334,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { } } -/// getSETFromCond - Return a set opcode for the given condition and +/// Return a set opcode for the given condition and /// whether it has memory operand. unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { @@ -2794,7 +3360,7 @@ unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { return Opc[CC][HasMemoryOperand ? 1 : 0]; } -/// getCMovFromCond - Return a cmov opcode for the given condition, +/// Return a cmov opcode for the given condition, /// register size in bytes, and operand type. unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand) { @@ -2879,7 +3445,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; // Handle unconditional branches. - if (I->getOpcode() == X86::JMP_4) { + if (I->getOpcode() == X86::JMP_1) { UnCondBrIter = I; if (!AllowModify) { @@ -2941,7 +3507,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) .addMBB(UnCondBrIter->getOperand(0).getMBB()); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); OldInst->eraseFromParent(); @@ -3006,7 +3572,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { --I; if (I->isDebugValue()) continue; - if (I->getOpcode() != X86::JMP_4 && + if (I->getOpcode() != X86::JMP_1 && getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. @@ -3031,7 +3597,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (Cond.empty()) { // Unconditional branch? assert(!FBB && "Unconditional branch with multiple successors!"); - BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); return 1; } @@ -3041,16 +3607,16 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, switch (CC) { case X86::COND_NP_OR_E: // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); ++Count; - BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); ++Count; break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. - BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); ++Count; - BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; default: { @@ -3061,7 +3627,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } if (FBB) { // Two-way Conditional branch. Insert the second branch. 
- BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; } return Count; @@ -3117,7 +3683,7 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } -/// isHReg - Test if the given register is a physical h register. +/// Test if the given register is a physical h register. static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } @@ -3389,11 +3955,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) @@ -3428,11 +3992,9 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); @@ -3528,7 +4090,7 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, return false; } -/// isRedundantFlagInstr - check whether the first instruction, whose only +/// Check whether the first instruction, whose only /// purpose is to update flags, can be made redundant. /// CMPrr can be made redundant by SUBrr if the operands are the same. /// This function can be extended later on. @@ -3571,7 +4133,7 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, return false; } -/// isDefConvertible - check whether the definition can be converted +/// Check whether the definition can be converted /// to remove a comparison against zero. 
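Note on the hunk above: the renamed helper that follows (isDefConvertible) feeds optimizeCompareInstr, which deletes a compare against zero when the instruction defining the register already set EFLAGS the same way. A minimal standalone C++ sketch of that idea, using invented toy names rather than the real LLVM opcodes or API:

#include <cstdio>

// Toy model: SUB and AND update ZF/SF from their result, so a following
// "cmp $0, %reg" adds no information; LEA never writes EFLAGS, so the
// compare has to stay.
enum class ToyOpc { Sub32rr, And32rr, Lea32r };

static bool defSetsFlagsLikeCmpZero(ToyOpc Op) {
  switch (Op) {
  case ToyOpc::Sub32rr:
  case ToyOpc::And32rr:
    return true;
  case ToyOpc::Lea32r:
    return false;
  }
  return false;
}

int main() {
  // "subl %esi, %edi ; cmpl $0, %edi ; je ..."   -> the cmp is removable.
  std::printf("cmp after sub removable: %d\n",
              (int)defSetsFlagsLikeCmpZero(ToyOpc::Sub32rr));
  // "leal 4(%esi), %edi ; cmpl $0, %edi ; je ..." -> the cmp must stay.
  std::printf("cmp after lea removable: %d\n",
              (int)defSetsFlagsLikeCmpZero(ToyOpc::Lea32r));
  return 0;
}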
inline static bool isDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { @@ -3601,14 +4163,12 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: - case X86::DEC64_32r: case X86::DEC64_16r: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: - case X86::INC64_32r: case X86::INC64_16r: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: @@ -3659,8 +4219,7 @@ inline static bool isDefConvertible(MachineInstr *MI) { } } -/// isUseDefConvertible - check whether the use can be converted -/// to remove a comparison against zero. +/// Check whether the use can be converted to remove a comparison against zero. static X86::CondCode isUseDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return X86::COND_INVALID; @@ -3679,7 +4238,7 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) { } } -/// optimizeCompareInstr - Check if there exists an earlier instruction that +/// Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. bool X86InstrInfo:: @@ -3970,7 +4529,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, return true; } -/// optimizeLoadInstr - Try to remove the load by folding it to a register +/// Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. @@ -4025,9 +4584,9 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; } -/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr -/// instruction with two undef reads of the register being defined. This is -/// used for mapping: +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. 
+/// This is used for mapping: /// %xmm4 = V_SET0 /// to: /// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> @@ -4099,7 +4658,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; - case X86::KSET0B: + case X86::KSET0B: case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); @@ -4179,7 +4738,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, unsigned i, + MachineInstr *MI, unsigned OpNum, const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Align, bool AllowCommute) const { @@ -4188,12 +4747,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // Atom favors register form of call. So, we do not fold loads into calls - // when X86Subtarget is Atom. + // For CPUs that favor the register form of a call, + // do not fold loads into calls. if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) return nullptr; - } unsigned NumOps = MI->getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && @@ -4209,13 +4767,13 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. - if (isTwoAddr && NumOps >= 2 && i < 2 && + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI->getOperand(0).isReg() && MI->getOperand(1).isReg() && MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; - } else if (i == 0) { // If operand 0 + } else if (OpNum == 0) { if (MI->getOpcode() == X86::MOV32r0) { NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); if (NewMI) @@ -4223,12 +4781,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (i == 1) { + } else if (OpNum == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (i == 2) { + } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (i == 3) { + } else if (OpNum == 3) { OpcodeTablePtr = &RegOp2MemOpTable3; + } else if (OpNum == 4) { + OpcodeTablePtr = &RegOp2MemOpTable4; } // If table selected... @@ -4243,7 +4803,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. 
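The renamed OpNum parameter in foldMemoryOperandImpl above selects which register-to-memory fold table to consult, and this change adds a fifth table (RegOp2MemOpTable4) for operand 4. A simplified sketch of that dispatch, with a stand-in table type rather than the real RegOp2MemOpTableType:

#include <cstdint>
#include <map>
#include <utility>

// Stand-in for the real fold tables: register-form opcode -> (memory-form
// opcode, fold flags / alignment info).
using ToyFoldTable = std::map<uint16_t, std::pair<uint16_t, unsigned>>;

// Mirrors the OpNum dispatch in the hunk above: the tied two-address case
// uses a dedicated table, otherwise operand index 0..4 picks Table0..Table4.
const ToyFoldTable *selectFoldTable(unsigned OpNum, bool IsTwoAddrFold,
                                    const ToyFoldTable *TwoAddrTable,
                                    const ToyFoldTable Tables[5]) {
  if (IsTwoAddrFold)
    return TwoAddrTable;   // fold into the tied def/use operand pair
  if (OpNum <= 4)
    return &Tables[OpNum]; // RegOp2MemOpTable0 .. RegOp2MemOpTable4
  return nullptr;          // no table for this operand position
}

// A miss in the chosen table (or a null table) means the load is not folded.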
@@ -4262,7 +4822,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (isTwoAddrFold) NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this); else - NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this); + NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this); if (NarrowToMOV32rm) { // If this is the special case where we use a MOV32rm to load a 32-bit @@ -4281,7 +4841,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; + unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; @@ -4339,11 +4899,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // No fusion if (PrintFailedFusing && !MI->isCopy()) - dbgs() << "We failed to fuse operand " << i << " in " << *MI; + dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI; return nullptr; } -/// hasPartialRegUpdate - Return true for all instructions that only update +/// Return true for all instructions that only update /// the first 32 or 64-bits of the destination register and leave the rest /// unmodified. This can be used to avoid folding loads if the instructions /// only update part of the destination register, and the non-updated part is @@ -4362,30 +4922,50 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, static bool hasPartialRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::CVTSI2SSrr: + case X86::CVTSI2SSrm: case X86::CVTSI2SS64rr: + case X86::CVTSI2SS64rm: case X86::CVTSI2SDrr: + case X86::CVTSI2SDrm: case X86::CVTSI2SD64rr: + case X86::CVTSI2SD64rm: case X86::CVTSD2SSrr: + case X86::CVTSD2SSrm: case X86::Int_CVTSD2SSrr: + case X86::Int_CVTSD2SSrm: case X86::CVTSS2SDrr: + case X86::CVTSS2SDrm: case X86::Int_CVTSS2SDrr: + case X86::Int_CVTSS2SDrm: case X86::RCPSSr: + case X86::RCPSSm: case X86::RCPSSr_Int: + case X86::RCPSSm_Int: case X86::ROUNDSDr: + case X86::ROUNDSDm: case X86::ROUNDSDr_Int: case X86::ROUNDSSr: + case X86::ROUNDSSm: case X86::ROUNDSSr_Int: case X86::RSQRTSSr: + case X86::RSQRTSSm: case X86::RSQRTSSr_Int: + case X86::RSQRTSSm_Int: case X86::SQRTSSr: + case X86::SQRTSSm: case X86::SQRTSSr_Int: + case X86::SQRTSSm_Int: + case X86::SQRTSDr: + case X86::SQRTSDm: + case X86::SQRTSDr_Int: + case X86::SQRTSDm_Int: return true; } return false; } -/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle +/// Inform the ExeDepsFix pass how many idle /// instructions we would like before a partial register update. 
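For orientation on the opcode list just extended above: scalar ops such as CVTSD2SS or SQRTSS write only the low element of their XMM destination, so the stale upper bits become a false input dependence. The clearance reported by the function below tells ExeDepsFix how much distance it wants before such an instruction; otherwise the pass breaks the dependence. A rough sketch with invented names, not the LLVM interface:

// Returns the number of idle instructions we would like to see before an
// instruction that only partially updates its destination register; zero
// means no special handling is needed.
unsigned toyPartialRegUpdateClearance(bool OpcodeHasPartialUpdate,
                                      bool DestUpperBitsAreReadLater,
                                      unsigned PreferredClearance) {
  if (!OpcodeHasPartialUpdate)
    return 0; // full-width def: no false dependence on the old value
  if (DestUpperBitsAreReadLater)
    return 0; // the merge is a real use, not a false dependence
  return PreferredClearance; // ask ExeDepsFix for this much breathing room
}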
unsigned X86InstrInfo:: getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, @@ -4415,28 +4995,52 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, static bool hasUndefRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::VCVTSI2SSrr: + case X86::VCVTSI2SSrm: case X86::Int_VCVTSI2SSrr: + case X86::Int_VCVTSI2SSrm: case X86::VCVTSI2SS64rr: + case X86::VCVTSI2SS64rm: case X86::Int_VCVTSI2SS64rr: + case X86::Int_VCVTSI2SS64rm: case X86::VCVTSI2SDrr: + case X86::VCVTSI2SDrm: case X86::Int_VCVTSI2SDrr: + case X86::Int_VCVTSI2SDrm: case X86::VCVTSI2SD64rr: + case X86::VCVTSI2SD64rm: case X86::Int_VCVTSI2SD64rr: + case X86::Int_VCVTSI2SD64rm: case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: case X86::Int_VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrm: case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: case X86::Int_VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrm: case X86::VRCPSSr: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: case X86::VROUNDSDr: + case X86::VROUNDSDm: case X86::VROUNDSDr_Int: case X86::VROUNDSSr: + case X86::VROUNDSSm: case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: - - // AVX-512 + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrm: case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrm: return true; } @@ -4509,8 +5113,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; @@ -4520,10 +5123,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min(Alignment, MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment()); + Alignment = + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4587,8 +5188,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; @@ -4743,7 +5343,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { // If operand 0 + } else if (OpNum == 0) { if (Opc == X86::MOV32r0) return true; @@ -4986,7 +5586,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, NewNodes.push_back(Store); // Preserve memory reference information. 
- cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second); } return true; @@ -5181,26 +5781,26 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, switch(Second->getOpcode()) { default: return false; - case X86::JE_4: - case X86::JNE_4: - case X86::JL_4: - case X86::JLE_4: - case X86::JG_4: - case X86::JGE_4: + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: FuseKind = FuseInc; break; - case X86::JB_4: - case X86::JBE_4: - case X86::JA_4: - case X86::JAE_4: + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: FuseKind = FuseCmp; break; - case X86::JS_4: - case X86::JNS_4: - case X86::JP_4: - case X86::JNP_4: - case X86::JO_4: - case X86::JNO_4: + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: FuseKind = FuseTest; break; } @@ -5313,14 +5913,10 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, return FuseKind == FuseCmp || FuseKind == FuseInc; case X86::INC16r: case X86::INC32r: - case X86::INC64_16r: - case X86::INC64_32r: case X86::INC64r: case X86::INC8r: case X86::DEC16r: case X86::DEC32r: - case X86::DEC64_16r: - case X86::DEC64_32r: case X86::DEC64r: case X86::DEC8r: return FuseKind == FuseInc; @@ -5345,7 +5941,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); } -/// getGlobalBaseReg - Return a virtual register initialized with the +/// Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. /// @@ -5478,7 +6074,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { MI->setDesc(get(table[Domain-1])); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. +/// Return the noop instruction to use for a noop. void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } @@ -5489,7 +6085,7 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { // getUnconditionalBranch and getTrap. void X86InstrInfo::getUnconditionalBranch( MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { - Branch.setOpcode(X86::JMP_4); + Branch.setOpcode(X86::JMP_1); Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); } @@ -5595,7 +6191,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } namespace { - /// CGBR - Create Global Base Reg pass. This initializes the PIC + /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. struct CGBR : public MachineFunctionPass { static char ID; @@ -5604,10 +6200,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF.getTarget()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); // Don't do anything if this is 64-bit as 64-bit PIC // uses RIP relative addressing. - if (TM->getSubtarget<X86Subtarget>().is64Bit()) + if (STI.is64Bit()) return false; // Only emit a global base reg in PIC mode. 
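As a reader's note on the CGBR hunks above and below: on x86-32 the PIC base is materialized by a call/pop pair (MOVPC32r), and for the plain 'GOT' PIC style an extra add of _GLOBAL_OFFSET_TABLE_ + [.-piclabel] follows; 64-bit code skips the pass entirely. A small sketch of that decision flow, using descriptive stand-in types rather than the pass's real API:

#include <string>

struct ToyPicConfig {
  bool Is64Bit;        // 64-bit PIC uses RIP-relative addressing instead
  bool IsPicStyleGOT;  // vanilla GOT style needs the extra ADD32ri
};

std::string describeGlobalBaseRegSetup(const ToyPicConfig &C) {
  if (C.Is64Bit)
    return "no base register: use RIP-relative addressing";
  std::string Seq = "MOVPC32r (call next insn; pop the return address)";
  if (C.IsPicStyleGOT)
    Seq += "; ADD32ri $_GLOBAL_OFFSET_TABLE_+[.-piclabel] into the base reg";
  return Seq;
}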
@@ -5626,10 +6223,10 @@ namespace { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86InstrInfo *TII = STI.getInstrInfo(); unsigned PC; - if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) + if (STI.isPICStyleGOT()) PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); else PC = GlobalBaseReg; @@ -5640,7 +6237,7 @@ namespace { // If we're using vanilla 'GOT' PIC style, we should use relative addressing // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. - if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) { + if (STI.isPICStyleGOT()) { // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", @@ -5721,10 +6318,9 @@ namespace { MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const X86TargetMachine *TM = - static_cast<const X86TargetMachine *>(&MF->getTarget()); - const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), @@ -5742,10 +6338,9 @@ namespace { // inserting a copy instruction after I. Returns the new instruction. MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const X86TargetMachine *TM = - static_cast<const X86TargetMachine *>(&MF->getTarget()); - const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 57b1958..4d15467 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -152,6 +152,7 @@ class X86InstrInfo final : public X86GenInstrInfo { RegOp2MemOpTableType RegOp2MemOpTable1; RegOp2MemOpTableType RegOp2MemOpTable2; RegOp2MemOpTableType RegOp2MemOpTable3; + RegOp2MemOpTableType RegOp2MemOpTable4; /// MemOp2RegOpTable - Load / store unfolding opcode map. /// @@ -174,6 +175,11 @@ public: /// const X86RegisterInfo &getRegisterInfo() const { return RI; } + /// getSPAdjust - This returns the stack pointer adjustment made by + /// this instruction. For x86, we need to handle more complex call + /// sequences involving PUSHes. + int getSPAdjust(const MachineInstr *MI) const override; + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" /// extension instruction. That is, it's like a copy where it's legal for the /// source to overlap the destination. e.g. X86::MOVSX64rr32. 
If this returns diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 3dbf819..9881caf 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -32,7 +32,8 @@ def SDTX86Cmov : SDTypeProfile<1, 4, // Unary and binary operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, - [SDTCisInt<0>, SDTCisVT<1, i32>]>; + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, @@ -188,11 +189,15 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; +def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; @@ -261,121 +266,75 @@ def ptr_rc_nosp : PointerLikeRegClass<1>; def X86MemAsmOperand : AsmOperandClass { let Name = "Mem"; } -def X86Mem8AsmOperand : AsmOperandClass { - let Name = "Mem8"; let RenderMethod = "addMemOperands"; -} -def X86Mem16AsmOperand : AsmOperandClass { - let Name = "Mem16"; let RenderMethod = "addMemOperands"; -} -def X86Mem32AsmOperand : AsmOperandClass { - let Name = "Mem32"; let RenderMethod = "addMemOperands"; -} -def X86Mem64AsmOperand : AsmOperandClass { - let Name = "Mem64"; let RenderMethod = "addMemOperands"; -} -def X86Mem80AsmOperand : AsmOperandClass { - let Name = "Mem80"; let RenderMethod = "addMemOperands"; -} -def X86Mem128AsmOperand : AsmOperandClass { - let Name = "Mem128"; let RenderMethod = "addMemOperands"; -} -def X86Mem256AsmOperand : AsmOperandClass { - let Name = "Mem256"; let RenderMethod = "addMemOperands"; -} -def X86Mem512AsmOperand : AsmOperandClass { - let Name = "Mem512"; let RenderMethod = "addMemOperands"; -} - -// Gather mem operands -def X86MemVX32Operand : AsmOperandClass { - let Name = "MemVX32"; let RenderMethod = "addMemOperands"; -} -def X86MemVY32Operand : AsmOperandClass { - let Name = "MemVY32"; let RenderMethod = "addMemOperands"; -} -def X86MemVZ32Operand : AsmOperandClass { - let Name = "MemVZ32"; let RenderMethod = "addMemOperands"; -} -def X86MemVX64Operand : AsmOperandClass { - let Name = "MemVX64"; let RenderMethod = "addMemOperands"; -} -def X86MemVY64Operand : AsmOperandClass { - let Name = "MemVY64"; let RenderMethod = "addMemOperands"; -} -def X86MemVZ64Operand : AsmOperandClass { - let Name = "MemVZ64"; let RenderMethod = "addMemOperands"; +let RenderMethod = "addMemOperands" in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } + // Gather mem operands + def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; } + def 
X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; } + def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; } + def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; } + def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; } + def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; } } def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; } -class X86MemOperand<string printMethod> : Operand<iPTR> { + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { let PrintMethod = printMethod; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; } -let OperandType = "OPERAND_MEMORY" in { +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm); +} + +def anymem : X86MemOperand<"printanymem">; + def opaque32mem : X86MemOperand<"printopaquemem">; def opaque48mem : X86MemOperand<"printopaquemem">; def opaque80mem : X86MemOperand<"printopaquemem">; def opaque512mem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem"> { - let ParserMatchClass = X86Mem8AsmOperand; } -def i16mem : X86MemOperand<"printi16mem"> { - let ParserMatchClass = X86Mem16AsmOperand; } -def i32mem : X86MemOperand<"printi32mem"> { - let ParserMatchClass = X86Mem32AsmOperand; } -def i64mem : X86MemOperand<"printi64mem"> { - let ParserMatchClass = X86Mem64AsmOperand; } -def i128mem : X86MemOperand<"printi128mem"> { - let ParserMatchClass = X86Mem128AsmOperand; } -def i256mem : X86MemOperand<"printi256mem"> { - let ParserMatchClass = X86Mem256AsmOperand; } -def i512mem : X86MemOperand<"printi512mem"> { - let ParserMatchClass = X86Mem512AsmOperand; } -def f32mem : X86MemOperand<"printf32mem"> { - let ParserMatchClass = X86Mem32AsmOperand; } -def f64mem : X86MemOperand<"printf64mem"> { - let ParserMatchClass = X86Mem64AsmOperand; } -def f80mem : X86MemOperand<"printf80mem"> { - let ParserMatchClass = X86Mem80AsmOperand; } -def f128mem : X86MemOperand<"printf128mem"> { - let ParserMatchClass = X86Mem128AsmOperand; } -def f256mem : X86MemOperand<"printf256mem">{ - let ParserMatchClass = X86Mem256AsmOperand; } -def f512mem : X86MemOperand<"printf512mem">{ - let ParserMatchClass = X86Mem512AsmOperand; } -def v512mem : Operand<iPTR> { - let PrintMethod = "printf512mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86Mem512AsmOperand; } +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem 
: X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; // Gather mem operands -def vx32mem : X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); - let ParserMatchClass = X86MemVX32Operand; } -def vy32mem : X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); - let ParserMatchClass = X86MemVY32Operand; } -def vx64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); - let ParserMatchClass = X86MemVX64Operand; } -def vy64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); - let ParserMatchClass = X86MemVY64Operand; } -def vy64xmem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm); - let ParserMatchClass = X86MemVY64Operand; } -def vz32mem : X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86MemVZ32Operand; } -def vz64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86MemVZ64Operand; } -} +def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>; +def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>; +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>; +def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>; +def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64Operand>; +def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; +def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of // plain GR64, so that it doesn't potentially require a REX prefix. 
@@ -424,125 +383,180 @@ def brtarget8 : Operand<OtherVT>; } -def X86SrcIdx8Operand : AsmOperandClass { - let Name = "SrcIdx8"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86SrcIdx16Operand : AsmOperandClass { - let Name = "SrcIdx16"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86SrcIdx32Operand : AsmOperandClass { - let Name = "SrcIdx32"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86SrcIdx64Operand : AsmOperandClass { - let Name = "SrcIdx64"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -def X86DstIdx8Operand : AsmOperandClass { - let Name = "DstIdx8"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86DstIdx16Operand : AsmOperandClass { - let Name = "DstIdx16"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86DstIdx32Operand : AsmOperandClass { - let Name = "DstIdx32"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86DstIdx64Operand : AsmOperandClass { - let Name = "DstIdx64"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -def X86MemOffs8AsmOperand : AsmOperandClass { - let Name = "MemOffs8"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86MemOffs16AsmOperand : AsmOperandClass { - let Name = "MemOffs16"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86MemOffs32AsmOperand : AsmOperandClass { - let Name = "MemOffs32"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86MemOffs64AsmOperand : AsmOperandClass { - let Name = "MemOffs64"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -let OperandType = "OPERAND_MEMORY" in { -def srcidx8 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx8Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx8"; } -def srcidx16 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx16Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx16"; } -def srcidx32 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx32Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx32"; } -def srcidx64 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx64Operand; +// Special parser to detect 16-bit mode to select 16-bit displacement. +def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. 
+let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMemAsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx64"; } -def dstidx8 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx8Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx8"; } -def dstidx16 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx16Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx16"; } -def dstidx32 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx32Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx32"; } -def dstidx64 : Operand<iPTR> { - let 
ParserMatchClass = X86DstIdx64Operand; +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx64"; } -def offset8 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs8AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs8"; } -def offset16 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs16AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs16"; } -def offset32 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs32AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs32"; } -def offset64 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs64AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs64"; } } +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, i8imm); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; def SSECC : Operand<i8> { - let PrintMethod = "printSSECC"; + let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } +def i8immZExt3 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 8; +}]>; + def AVXCC : Operand<i8> { - let PrintMethod = "printAVXCC"; + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def i8immZExt5 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 32; +}]>; + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; let OperandType = "OPERAND_IMMEDIATE"; } @@ -599,6 +613,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { ImmSExti64i32AsmOperand]; } +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def 
ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + // A couple of more descriptive operand definitions. // 16-bits but only 8 bits are significant. def i16i8imm : Operand<i16> { @@ -617,6 +639,27 @@ def i64i32imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // 64-bits but only 32 bits are significant, and those bits are treated as being // pc relative. def i64i32imm_pcrel : Operand<i64> { @@ -625,21 +668,15 @@ def i64i32imm_pcrel : Operand<i64> { let OperandType = "OPERAND_PCREL"; } -// 64-bits but only 8 bits are significant. -def i64i8imm : Operand<i64> { - let ParserMatchClass = ImmSExti64i8AsmOperand; - let OperandType = "OPERAND_IMMEDIATE"; -} - def lea64_32mem : Operand<i32> { - let PrintMethod = "printi32mem"; + let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } // Memory operands that use 64-bit pointers in both ILP32 and LP64. def lea64mem : Operand<i64> { - let PrintMethod = "printi64mem"; + let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } @@ -676,6 +713,9 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def vectoraddr : ComplexPattern<iPTR, 5, "SelectAddr", [],[SDNPWantParent]>; +//def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; + //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. 
def HasCMov : Predicate<"Subtarget->hasCMov()">; @@ -706,14 +746,19 @@ def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; -def HasCDI : Predicate<"Subtarget->hasCDI()">; -def HasPFI : Predicate<"Subtarget->hasPFI()">; -def HasERI : Predicate<"Subtarget->hasERI()">; -def HasDQI : Predicate<"Subtarget->hasDQI()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">, + AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasPFI : Predicate<"Subtarget->hasPFI()">, + AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; +def HasERI : Predicate<"Subtarget->hasERI()">, + AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">; +def HasDQI : Predicate<"Subtarget->hasDQI()">, + AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; -def HasBWI : Predicate<"Subtarget->hasBWI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">, + AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; def HasVLX : Predicate<"Subtarget->hasVLX()">, - AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">; + AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -736,10 +781,8 @@ def HasHLE : Predicate<"Subtarget->hasHLE()">; def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; -def HasSGX : Predicate<"Subtarget->hasSGX()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; -def HasSMAP : Predicate<"Subtarget->hasSMAP()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; @@ -757,6 +800,9 @@ def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; @@ -773,6 +819,7 @@ def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; +def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. @@ -803,6 +850,11 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; +// Predicate used to help when pattern matching LZCNT/TZCNT. +def X86_COND_E_OR_NE : ImmLeaf<i8, [{ + return (Imm == X86::COND_E) || (Imm == X86::COND_NE); +}]>; + let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. 
def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; @@ -905,7 +957,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ // // Nop -let neverHasSideEffects = 1, SchedRW = [WriteZero] in { +let hasSideEffects = 0, SchedRW = [WriteZero] in { def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; @@ -919,12 +971,12 @@ def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; let SchedRW = [WriteALU] in { -let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[Not64BitMode]>; -let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[In64BitMode]>; @@ -934,7 +986,7 @@ def LEAVE64 : I<0xC9, RawFrm, // Miscellaneous Instructions. // -let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG16>, OpSize16; @@ -948,11 +1000,6 @@ def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; - -def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, - OpSize16; -def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, - OpSize32, Requires<[Not64BitMode]>; } // mayLoad, SchedRW let mayStore = 1, SchedRW = [WriteStore] in { @@ -981,16 +1028,26 @@ def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; +} // mayStore, SchedRW +} + +let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, + OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, + OpSize32, Requires<[Not64BitMode]>; +} +let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, OpSize16; def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, OpSize32, Requires<[Not64BitMode]>; - -} // mayStore, SchedRW } -let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; @@ -1009,7 +1066,7 @@ def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], } // mayStore, SchedRW } -let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1, +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, 
(outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; @@ -1021,22 +1078,22 @@ def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), Requires<[In64BitMode]>; } -let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; -let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], - mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, OpSize32, Requires<[Not64BitMode]>; def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, OpSize16, Requires<[Not64BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], - mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, OpSize32, Requires<[Not64BitMode]>; def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, @@ -1166,7 +1223,7 @@ def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), // Move Instructions. // let SchedRW = [WriteMove] in { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), @@ -1225,62 +1282,67 @@ def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), let hasSideEffects = 0 in { -/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a -/// 32-bit offset from the segment base. These are only valid in x86-32 mode. +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. 
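Reading aid for the moffs rework that follows: the A0-A3 move forms now carry an explicit AdSize16/AdSize32/AdSize64 attribute, so the width of the absolute offset operand tracks the effective address size instead of a mode predicate. A tiny sketch of that mapping as I read the hunk (not generated from the .td):

// Width in bits of the absolute offset in the MOV AL/AX/EAX/RAX "moffs"
// encodings, keyed by the effective address size of the instruction.
unsigned toyMoffsOffsetWidthBits(unsigned EffectiveAddrSizeBits) {
  switch (EffectiveAddrSizeBits) {
  case 16: return 16; // MOV8ao16 and friends use the offset16_* operands
  case 32: return 32; // MOV8ao32 and friends use the offset32_* operands
  case 64: return 64; // the movabs forms use the offset64_* operands
  default: return 0;  // not a legal x86 address size
  }
}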
let SchedRW = [WriteALU] in { let mayLoad = 1 in { let Defs = [AL] in -def MOV8o8a : Ii32 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, - Requires<[In32BitMode]>; +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, + AdSize32; let Defs = [AX] in -def MOV16o16a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, - OpSize16, Requires<[In32BitMode]>; +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; let Defs = [EAX] in -def MOV32o32a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, - OpSize32, Requires<[In32BitMode]>; +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, + AdSize32; let Defs = [AL] in -def MOV8o8a_16 : Ii16 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, - AdSize, Requires<[In16BitMode]>; +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16; let Defs = [AX] in -def MOV16o16a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, - OpSize16, AdSize, Requires<[In16BitMode]>; +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; let Defs = [EAX] in -def MOV32o32a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, - AdSize, OpSize32, Requires<[In16BitMode]>; +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + AdSize16, OpSize32; } let mayStore = 1 in { let Uses = [AL] in -def MOV8ao8 : Ii32 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, - Requires<[In32BitMode]>; +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32; let Uses = [AX] in -def MOV16ao16 : Ii32 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, - OpSize16, Requires<[In32BitMode]>; +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; let Uses = [EAX] in -def MOV32ao32 : Ii32 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, - OpSize32, Requires<[In32BitMode]>; +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, + AdSize32; let Uses = [AL] in -def MOV8ao8_16 : Ii16 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, - AdSize, Requires<[In16BitMode]>; +def MOV8o16a : Ii16<0xA2, 
RawFrmMemOffs, (outs offset16_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16; let Uses = [AX] in -def MOV16ao16_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, - OpSize16, AdSize, Requires<[In16BitMode]>; +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; let Uses = [EAX] in -def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, - OpSize32, AdSize, Requires<[In16BitMode]>; +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize16; } } @@ -1288,40 +1350,34 @@ def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), // and use the movabs mnemonic to indicate this specific form. let mayLoad = 1 in { let Defs = [AL] in -def MOV64o8a : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "movabs{b}\t{$src, %al|al, $src}", []>, - Requires<[In64BitMode]>; +def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64; let Defs = [AX] in -def MOV64o16a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, - Requires<[In64BitMode]>; +def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64; let Defs = [EAX] in -def MOV64o32a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), +def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32, - Requires<[In64BitMode]>; + AdSize64; let Defs = [RAX] in -def MOV64o64a : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64:$src), - "movabs{q}\t{$src, %rax|rax, $src}", []>, - Requires<[In64BitMode]>; +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64; } let mayStore = 1 in { let Uses = [AL] in -def MOV64ao8 : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "movabs{b}\t{%al, $dst|$dst, al}", []>, - Requires<[In64BitMode]>; +def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64; let Uses = [AX] in -def MOV64ao16 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, - Requires<[In64BitMode]>; +def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64; let Uses = [EAX] in -def MOV64ao32 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), +def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins), "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32, - Requires<[In64BitMode]>; + AdSize64; let Uses = [RAX] in -def MOV64ao64 : RIi64<0xA3, RawFrmMemOffs, (outs offset64:$dst), (ins), - "movabs{q}\t{%rax, $dst|$dst, rax}", []>, - Requires<[In64BitMode]>; +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64; } } // hasSideEffects = 0 @@ -1371,17 +1427,17 @@ def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), // that they can be used for copying and storing h 
registers, which can't be // encoded when a REX prefix is present. let isCodeGenOnly = 1 in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, Sched<[WriteMove]>; -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, hasSideEffects = 0 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV_MEM>, Sched<[WriteStore]>; -let mayLoad = 1, neverHasSideEffects = 1, +let mayLoad = 1, hasSideEffects = 0, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), @@ -1395,7 +1451,7 @@ let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", [(set EFLAGS, (X86sahf AH))], IIC_AHF>; -let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], IIC_AHF>; // AH = flags } // SchedRW @@ -1981,42 +2037,42 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { } let Predicates = [HasLZCNT] in { - def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E), - (X86cmp GR16:$src, (i16 0))), + def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), (LZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E), + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (LZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E), + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (LZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E), + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (LZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E), + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (LZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E), + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (LZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (LZCNT16rm addr:$src)>; - def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (LZCNT32rm addr:$src)>; - def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (LZCNT64rm addr:$src)>; - def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (LZCNT16rm addr:$src)>; - def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E), - (X86cmp 
(loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (LZCNT32rm addr:$src)>; - def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (LZCNT64rm addr:$src)>; } @@ -2097,42 +2153,42 @@ let Predicates = [HasBMI] in { } let Predicates = [HasBMI] in { - def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E), + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (TZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E), + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (TZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E), + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (TZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E), + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (TZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E), + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (TZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E), + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (TZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (TZCNT16rm addr:$src)>; - def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (TZCNT32rm addr:$src)>; - def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (TZCNT64rm addr:$src)>; - def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (TZCNT16rm addr:$src)>; - def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (TZCNT32rm addr:$src)>; - def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (TZCNT64rm addr:$src)>; } @@ -2167,11 +2223,11 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { def CountTrailingOnes : SDNodeXForm<imm, [{ // Count the trailing ones in the immediate. 
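The X86_COND_E_OR_NE cmov patterns above target the usual zero-guarded ctlz/cttz idiom. A minimal C++ sketch of that idiom (function names are illustrative, not from the patch); with -mlzcnt/-mbmi the whole select folds into a single LZCNT/TZCNT, since those instructions already return the operand width for a zero input:

// Without LZCNT/TZCNT this needs a BSR/BSF plus a CMOV; with them it is one instruction.
unsigned Clz32(unsigned X) { return X == 0 ? 32u : unsigned(__builtin_clz(X)); }
unsigned Ctz32(unsigned X) { return X == 0 ? 32u : unsigned(__builtin_ctz(X)); }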
- return getI8Imm(CountTrailingOnes_64(N->getZExtValue())); + return getI8Imm(countTrailingOnes(N->getZExtValue())); }]>; def BZHIMask : ImmLeaf<i64, [{ - return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32); + return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32); }]>; let Predicates = [HasBMI2] in { @@ -2361,6 +2417,16 @@ let Predicates = [HasTBM] in { } // HasTBM //===----------------------------------------------------------------------===// +// Memory Instructions +// + +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", []>, PD; +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; +def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; + + +//===----------------------------------------------------------------------===// // Subsystems. //===----------------------------------------------------------------------===// @@ -2513,6 +2579,12 @@ def : MnemonicAlias<"fnstsww", "fnstsw", "att">; def : MnemonicAlias<"fucomip", "fucompi", "att">; def : MnemonicAlias<"fwait", "wait">; +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; + class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> @@ -2700,28 +2772,28 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // this is compatible with what GAS does. def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"call *$dst", (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call *$dst", (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"call *$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call 
{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>; -def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>; -def : InstAlias<"imull $imm, $r", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm)>; -def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>; -def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; -def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; +def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; @@ -2745,34 +2817,34 @@ def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. 
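The fxsaveq/xsaveq family of mnemonic aliases added a little above caters to AT&T-syntax sources that spell the REX.W forms with a q suffix; with the alias, fxsaveq assembles to the same encoding as fxsave64. A small sketch, assuming GNU inline asm on an x86-64 target (the struct and function names are made up for the example); the save area must be 512 bytes and 16-byte aligned:

struct alignas(16) FXSaveArea { unsigned char Bytes[512]; };
static FXSaveArea State;
// Saves the x87/SSE state; "fxsaveq %0" would be accepted as the same instruction.
void SaveFPState() { asm volatile("fxsave64 %0" : "=m"(State)); }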
-def : InstAlias<"movq $src, $dst", +def : InstAlias<"movq {$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq $src, $dst", +def : InstAlias<"movq {$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 9001fba..eaa7894 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -125,9 +125,9 @@ let Constraints = "$src1 = $dst" in { (bitconvert (load_mmx addr:$src2))))], itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), - (ins VR64:$src1, i32i8imm:$src2), + (ins VR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>, + [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>, Sched<[WriteVecShift]>; } } @@ -170,12 +170,12 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, /// PALIGN MMX instructions (require SSSE3). 
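For reference, the MMX PALIGNR defined below takes a small unsigned byte count, which is why its immediate operand switches from i8imm to u8imm here. A short illustration using the C intrinsic, assuming an SSSE3-capable target (the function name is illustrative):

#include <tmmintrin.h>
__m64 AlignPair(__m64 Hi, __m64 Lo) {
  // Concatenate Hi:Lo, shift right by 3 bytes, and keep the low 64 bits.
  return _mm_alignr_pi8(Hi, Lo, 3);
}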
multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2, i8imm:$src3), - !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + (ins VR64:$src1, VR64:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, Sched<[WriteShuffle]>; def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2, i8imm:$src3), + (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, @@ -220,23 +220,29 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", // Data Transfer Instructions def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, + [(set VR64:$dst, (x86mmx (scalar_to_vector GR32:$src)))], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; -let canFoldAsLoad = 1 in def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (scalar_to_vector (loadi32 addr:$src))))], IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), + (MMX_MOVD64rm addr:$src)>; +} + let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, Sched<[WriteStore]>; -// Low word of MMX to GPR. -def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, - [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, @@ -248,16 +254,21 @@ def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), [(set VR64:$dst, (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>; + // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. 
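The GR64 <-> VR64 moves in this hunk are plain bitcasts; the new isCodeGenOnly rm/mr variants only expose the memory-operand encodings to the disassembler. A sketch of the register forms from C, assuming an x86-64 target with MMX enabled (function names are illustrative):

#include <mmintrin.h>
__m64 GprToMmx(long long V) { return _mm_cvtsi64_m64(V); }  // movq/movd GR64 -> VR64
long long MmxToGpr(__m64 V) { return _mm_cvtm64_si64(V); }  // movq/movd VR64 -> GR64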
let SchedRW = [WriteMove] in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), - "movd\t{$src, $dst|$dst, $src}", + "movd\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; @@ -268,6 +279,12 @@ def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), } } // SchedRW +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, + (outs i64mem:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>; + let SchedRW = [WriteLoad] in { let canFoldAsLoad = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), @@ -453,6 +470,13 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLQrm VR64:$src1, addr:$src2)>; + defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_mmx_psll_w, int_x86_mmx_pslli_w, MMX_SHIFT_ITINS>; @@ -463,6 +487,13 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_mmx_psll_q, int_x86_mmx_pslli_q, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLQrm VR64:$src1, addr:$src2)>; + defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_mmx_psra_w, int_x86_mmx_psrai_w, MMX_SHIFT_ITINS>; @@ -470,6 +501,11 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_mmx_psra_d, int_x86_mmx_psrai_d, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRAWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRADrm VR64:$src1, addr:$src2)>; + // Comparison Instructions defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, MMX_INTALU_ITINS>; @@ -486,19 +522,19 @@ defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, MMX_INTALU_ITINS>; // -- Unpack Instructions -defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", int_x86_mmx_punpckhbw, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", int_x86_mmx_punpckhwd, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", int_x86_mmx_punpckhdq, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", int_x86_mmx_punpcklbw, MMX_UNPCK_L_ITINS>; -defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", 
int_x86_mmx_punpcklwd, MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", @@ -518,13 +554,13 @@ defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, MMX_PSHUF_ITINS>; def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, - (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), + (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, - (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), + (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), @@ -559,27 +595,27 @@ let Constraints = "$src1 = $dst" in { // Extract / Insert def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, - (iPTR imm:$src2)))], + imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, - (outs VR64:$dst), - (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3), + (outs VR64:$dst), + (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32orGR64:$src2, (iPTR imm:$src3)))], + GR32orGR64:$src2, imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3), + (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3)))], + imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td index 47c5dc5..84119ad 100644 --- a/lib/Target/X86/X86InstrSGX.td +++ b/lib/Target/X86/X86InstrSGX.td @@ -17,8 +17,8 @@ // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), - "encls", []>, TB, Requires<[HasSGX]>; + "encls", []>, TB; // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), - "enclu", []>, TB, Requires<[HasSGX]>; + "enclu", []>, TB; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index cc896f0..d2929d2 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -548,13 +548,13 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, string base_opc, - string asm_opr> { + string asm_opr, Domain d = GenericDomain> { def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr), [(set VR128:$dst, (vt (OpNode VR128:$src1, (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; + IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>; // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in @@ -565,49 +565,55 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, } multiclass sse12_move<RegisterClass RC, SDNode 
OpNode, ValueType vt, - X86MemOperand x86memop, string OpcodeStr> { + X86MemOperand x86memop, string OpcodeStr, + Domain d = GenericDomain> { // AVX defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, VEX_4V, VEX_LIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, VEX, VEX_LIG, Sched<[WriteStore]>; // SSE1 & 2 let Constraints = "$src1 = $dst" in { defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, - "\t{$src2, $dst|$dst, $src2}">; + "\t{$src2, $dst|$dst, $src2}", d>; } def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, Sched<[WriteStore]>; } // Loading from memory automatically zeroing upper bits. multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_pat, string OpcodeStr> { + PatFrag mem_pat, string OpcodeStr, + Domain d = GenericDomain> { def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>; + IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>; def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>; + IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>; } -defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS; -defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD; +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", + SSEPackedSingle>, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", + SSEPackedDouble>, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { - defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + SSEPackedSingle>, XS; let AddedComplexity = 20 in - defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + SSEPackedDouble>, XD; } // Patterns @@ -809,7 +815,7 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, string asm, Domain d, OpndItins itins, bit IsReMaterializable = 1> { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, Sched<[WriteFShuffle]>; @@ -1332,6 +1338,8 @@ let Predicates = [HasAVX] in { (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; + // VMOVHPD patterns + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. 
One use disappears at isel time @@ -1344,6 +1352,11 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE1] in { @@ -1357,6 +1370,8 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { + // MOVHPD patterns + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time @@ -1369,6 +1384,11 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -1477,7 +1497,7 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, Domain d, OpndItins itins> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [], itins.rr, d>, Sched<[itins.Sched]>; let mayLoad = 1 in @@ -1488,7 +1508,7 @@ let neverHasSideEffects = 1 in { multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[WriteCvtI2F]>; @@ -1497,7 +1517,7 @@ let neverHasSideEffects = 1, Predicates = [UseAVX] in { (ins DstRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[WriteCvtI2FLd, ReadAfterLd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 } let Predicates = [UseAVX] in { @@ -1804,7 +1824,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", /// SSE 2 Only // Convert scalar double to scalar single -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], @@ -1869,7 +1889,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2191,7 +2211,7 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), // Convert Packed DW Integers to Packed Double FP let Predicates = [HasAVX] in { -let neverHasSideEffects = 1, mayLoad = 1 in +let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX, 
Sched<[WriteCvtI2FLd]>; @@ -2213,7 +2233,7 @@ def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), Sched<[WriteCvtI2F]>; } -let neverHasSideEffects = 1, mayLoad = 1 in +let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; @@ -2319,26 +2339,26 @@ let Predicates = [UseSSE2] in { multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, Operand CC, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, string asm_alt, - OpndItins itins> { + OpndItins itins, ImmLeaf immLeaf> { def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], + (ld_frag addr:$src2), immLeaf:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], + (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; let mayLoad = 1 in def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], + (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], IIC_SSE_ALU_F32S_RM>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -2347,38 +2367,37 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S>, - XS, VEX_4V, VEX_LIG; + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S>, // same latency as 32 bit compare + SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, - XS; + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, + i8immZExt3>, XS; defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSE_ALU_F64S>, - XD; + SSE_ALU_F64S, i8immZExt3>, XD; } multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, - Intrinsic Int, string asm, OpndItins itins> { + Intrinsic Int, string asm, OpndItins itins, + ImmLeaf immLeaf> { def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, imm:$cc))], + VR128:$src, immLeaf:$cc))], itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - (load addr:$src), imm:$cc))], + (load addr:$src), 
immLeaf:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -2387,19 +2406,19 @@ let isCodeGenOnly = 1 in { // Aliases to match intrinsics which expect XMM operand(s). defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V; defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, // same latency as f32 + SSE_ALU_F32S, i8immZExt5>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S>, XS; + SSE_ALU_F32S, i8immZExt3>, XS; defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F64S>, + SSE_ALU_F64S, i8immZExt3>, XD; } } @@ -2473,26 +2492,28 @@ let Defs = [EFLAGS] in { // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Operand CC, Intrinsic Int, string asm, - string asm_alt, Domain d, - OpndItins itins = SSE_ALU_F32P> { + string asm_alt, Domain d, ImmLeaf immLeaf, + PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { + let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], + [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], itins.rr, d>, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
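The new ImmLeaf parameter threaded through these compare multiclasses reflects the encoding limits: the legacy SSE forms only accept condition codes 0 through 7 (i8immZExt3), while the VEX-encoded AVX forms take the extended 0 through 31 range (i8immZExt5). A small illustration with the C intrinsics; the values noted in the comments are the immediates that end up in $cc:

#include <immintrin.h>
__m128 CmpLtSSE(__m128 A, __m128 B) { return _mm_cmplt_ps(A, B); }              // cc = 1
#ifdef __AVX__
__m128 CmpNgeAVX(__m128 A, __m128 B) { return _mm_cmp_ps(A, B, _CMP_NGE_UQ); }  // cc = 25
#endif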
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; + let mayLoad = 1 in def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; } @@ -2501,61 +2522,61 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, PS, VEX_4V; + SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, PD, VEX_4V; + SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, PS, VEX_4V, VEX_L; + SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, PD, VEX_4V, VEX_L; + SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle, SSE_ALU_F32P>, PS; + SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble, SSE_ALU_F64P>, PD; + SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; } let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)), (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)), (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; -def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)), (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; -def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : 
Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; } let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; } let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; } @@ -2568,12 +2589,12 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, + (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffle]>; @@ -2729,24 +2750,6 @@ let Predicates = [HasAVX1Only] in { (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; } -let Predicates = [HasAVX] in { - // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the - // problem is during lowering, where it's not possible to recognize the load - // fold cause it has two uses through a bitcast. One use disappears at isel - // time and the fold opportunity reappears. - def : Pat<(v2f64 (X86Movddup VR128:$src)), - (VUNPCKLPDrr VR128:$src, VR128:$src)>; -} - -let Predicates = [UseSSE2] in { - // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the - // problem is during lowering, where it's not possible to recognize the load - // fold cause it has two uses through a bitcast. One use disappears at isel - // time and the fold opportunity reappears. 
- def : Pat<(v2f64 (X86Movddup VR128:$src)), - (UNPCKLPDrr VR128:$src, VR128:$src)>; -} - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Extract Floating-Point Sign mask //===----------------------------------------------------------------------===// @@ -2838,7 +2841,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoVLX] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2846,7 +2849,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2] in +let Predicates = [HasAVX2, NoVLX] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2867,40 +2870,73 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, // SSE 1 & 2 - Logical Instructions //===----------------------------------------------------------------------===// -/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops -/// -multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, - SDNode OpNode, OpndItins itins> { +// Multiclass for scalars using the X86 logical operation aliases for FP. +multiclass sse12_fp_packed_scalar_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; + } +} + +let isCodeGenOnly = 1 in { + defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +// Multiclass for vectors using the X86 logical operation aliases for FP. 
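One concrete reason scalar FP code wants these logical "alias" forms: clearing the sign bit of a float is an ANDPS against a constant mask, an X86fand on a scalar value, which the FsAND/FvAND pseudo-variants in this hunk exist to select. A sketch of the equivalent source-level operation (function name is illustrative):

#include <emmintrin.h>
float AbsF32(float X) {
  // andps with 0x7fffffff clears the sign bit of the scalar lane.
  const __m128 SignMask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(X), SignMask));
}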
+multiclass sse12_fp_packed_vector_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, + VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, + VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V; + } let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, - f32, f128mem, memopfsf32, SSEPackedSingle, itins>, + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, PS; - defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, - f64, f128mem, memopfsf64, SSEPackedDouble, itins>, + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, PD; } } -// Alias bitwise logical operations using SSE logical ops on packed FP values. let isCodeGenOnly = 1 in { - defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, SSE_BIT_ITINS_P>; - defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, SSE_BIT_ITINS_P>; - defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, SSE_BIT_ITINS_P>; let isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, + defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, SSE_BIT_ITINS_P>; } @@ -2908,6 +2944,7 @@ let isCodeGenOnly = 1 in { /// multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], @@ -2938,6 +2975,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)))], 0>, PD, VEX_4V; + } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, @@ -2993,6 +3031,7 @@ let Predicates = [HasAVX1Only] in { /// classes below multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins.s, 0>, PS, VEX_4V; @@ -3006,6 +3045,7 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; + } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, @@ -3081,10 +3121,9 @@ let isCodeGenOnly = 1 in { } // Patterns used to select SSE scalar fp arithmetic instructions from -// a scalar fp operation followed by a blend. 
+// either: // -// These patterns know, for example, how to select an ADDSS from a -// float add plus vector insert. +// (1) a scalar fp operation followed by a blend // // The effect is that the backend no longer emits unnecessary vector // insert instructions immediately after SSE scalar fp instructions @@ -3096,218 +3135,14 @@ let isCodeGenOnly = 1 in { // return A; // } // -// previously we generated: +// Previously we generated: // addss %xmm0, %xmm1 // movss %xmm1, %xmm0 -// -// we now generate: +// +// We now generate: // addss %xmm1, %xmm0 - -let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; -} - -let Predicates = [UseSSE2] in { - // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is - // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When - // selecting SSE scalar single-precision fp arithmetic instructions, make - // sure that we correctly match them. 
- - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 
0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [HasAVX] in { - // The following patterns select AVX Scalar single/double precision fp - // arithmetic instructions. - - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -// Patterns used to select SSE scalar fp arithmetic instructions from -// a vector packed single/double fp operation followed by a vector insert. +// +// (2) a vector packed single/double fp operation followed by a vector insert // // The effect is that the backend converts the packed fp instruction // followed by a vector insert into a single SSE scalar fp instruction. @@ -3318,160 +3153,151 @@ let Predicates = [HasAVX] in { // return (__m128) {c[0], a[1], a[2], a[3]}; // } // -// previously we generated: +// Previously we generated: // addps %xmm0, %xmm1 // movss %xmm1, %xmm0 -// -// we now generate: +// +// We now generate: // addss %xmm1, %xmm0 -let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (MULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; -} +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. +multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE1] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } + + // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too. 
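Concretely, the SSE4.1 case the comment above refers to looks like this at the source level. A small sketch (illustration only, compile with -msse4.1); the scalar add in lane 0 followed by a blend with immediate 1 is the shape these patterns are meant to collapse into a single addss:

#include <immintrin.h>

/* Scalar add on lane 0, re-inserted into the original vector with an
   SSE4.1 blend -- the "extracted scalar math op with insert via blend"
   form matched below. */
__m128 add_lane0(__m128 a, float b) {
  float lo = _mm_cvtss_f32(a) + b;              /* extract lane 0, fadd */
  return _mm_blend_ps(a, _mm_set_ss(lo), 0x1);  /* blend result back into lane 0 */
}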
+ let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via insertps + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; -let Predicates = [UseSSE2] in { - // SSE2 patterns to select scalar double-precision fp arithmetic instructions - // from a packed double-precision fp instruction plus movsd. - - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; -} + } -let Predicates = [UseSSE41] in { - // With SSE4.1 we may see these operations using X86Blendi rather than - // X86Movs{s,d}. - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (MULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + // Repeat everything for 
AVX, except for the movss + scalar combo... + // because that one shouldn't occur with AVX codegen? + let Predicates = [HasAVX] in { + // extracted scalar math op with insert via insertps + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (iPTR 0))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } } -let Predicates = [HasAVX] in { - // The following patterns select AVX Scalar single/double precision fp - // arithmetic instructions from a packed single precision fp instruction - // plus movss/movsd. - - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - // Also handle X86Blendi-based patterns. 
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; +defm : scalar_math_f32_patterns<fadd, "ADD">; +defm : scalar_math_f32_patterns<fsub, "SUB">; +defm : scalar_math_f32_patterns<fmul, "MUL">; +defm : scalar_math_f32_patterns<fdiv, "DIV">; + +multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE2] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match those too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // Repeat everything for AVX. 
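For reference, the other shape these multiclasses handle (case (2) from the comment block earlier in this hunk), sketched in f64 form; under AVX the same patterns select the VEX-encoded V*SD twins defined below. Illustration only:

#include <immintrin.h>

/* Packed add whose result only survives in lane 0; the "vector math op
   with insert via blend" patterns are meant to shrink this to a single
   (v)addsd. */
__m128d add_lane0_pd(__m128d a, __m128d b) {
  __m128d sum = _mm_add_pd(a, b);
  return _mm_blend_pd(a, sum, 0x1);  /* lane 0 from sum, lane 1 from a (SSE4.1) */
}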
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } } +defm : scalar_math_f64_patterns<fadd, "ADD">; +defm : scalar_math_f64_patterns<fsub, "SUB">; +defm : scalar_math_f64_patterns<fmul, "MUL">; +defm : scalar_math_f64_patterns<fdiv, "DIV">; + + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -3518,103 +3344,106 @@ def SSE_RCPS : OpndItins< >; } -/// sse1_fp_unop_s - SSE1 unops in scalar form. -multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1,f32mem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; +/// sse_fp_unop_s - SSE1 unops in scalar form +/// For the non-AVX defs, we need $src1 to be tied to $dst because +/// the HW instructions are 2 operand / destructive. 
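The tied-operand modeling matters because the scalar unops leave the upper lanes of the destination untouched, so the register is effectively read as well as written. A small standalone C check of that pass-through behaviour (illustration only, assuming SSE is available):

#include <stdio.h>
#include <immintrin.h>

/* _mm_sqrt_ss takes the square root of lane 0 and copies lanes 1-3 of its
   source unchanged -- the behaviour the *_Int defs below model by keeping
   the destination as an extra (tied) source operand. */
int main(void) {
  __m128 v = _mm_setr_ps(9.0f, 2.0f, 3.0f, 4.0f);
  float out[4];
  _mm_storeu_ps(out, _mm_sqrt_ss(v));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* prints: 3 2 3 4 */
  return 0;
}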
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, Intrinsic Intr, + SDNode OpNode, OpndItins itins, Predicate target, + string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode RC:$src1))], itins.rr>, Sched<[itins.Sched]>, + Requires<[target]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>, + Requires<[target, OptForSize]>; + + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } } -} - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; - // For scalar unary operations, fold a load into the operation - // only in OptForSize mode. It eliminates an instruction, but it also - // eliminates a whole-register clobber (the load), so it introduces a - // partial register update condition. - def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>, - Sched<[itins.Sched.Folded]>; -} -} - -/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand. 
-multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1,f32mem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; + let Predicates = [target] in { + def : Pat<(vt (OpNode mem_cpat:$src)), + (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; + // These are unary operations, but they are modeled as having 2 source operands + // because the high elements of the destination are unchanged in SSE. + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; + def : Pat<(Intr (load addr:$src)), + (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) + addr:$src), VR128))>; + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } } - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; - // For scalar unary operations, fold a load into the operation - // only in OptForSize mode. It eliminates an instruction, but it also - // eliminates a whole-register clobber (the load), so it introduces a - // partial register update condition. 
- def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; - let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { - def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [], itins.rr>, Sched<[itins.Sched]>; - let mayLoad = 1, hasSideEffects = 0 in - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, + Intrinsic Intr, SDNode OpNode, OpndItins itins, + Predicate target, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rr>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in { + // todo: uncomment when all r_Int forms will be added to X86InstrInfo.cpp + //def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), + // (ins VR128:$src1, VR128:$src2), + // !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + // []>, Sched<[itins.Sched.Folded]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } } + + let Predicates = [target] in { + def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) + (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + + // todo: use r_Int form when it will be ready + //def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) + // (VT (IMPLICIT_DEF)), VR128:$src)>; + def : Pat<(Intr VR128:$src), + (vt (COPY_TO_REGCLASS( + !cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), + (ScalarVT (COPY_TO_REGCLASS VR128:$src, RC))), VR128))>; + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + let Predicates = [target, OptForSize] in + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; } /// sse1_fp_unop_p - SSE1 unops in packed form. @@ -3693,53 +3522,6 @@ let Predicates = [HasAVX] in { } // isCodeGenOnly = 1 } -/// sse2_fp_unop_s - SSE2 unops in scalar form. 
-multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int, OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), - (ins FR64:$src1, FR64:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1,f64mem:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - } -} - - def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>, - Sched<[itins.Sched]>; - // See the comments in sse1_fp_unop_s for why this is OptForSize. - def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, - Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>, - Sched<[itins.Sched.Folded]>; -} -} - /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { @@ -3776,90 +3558,47 @@ let Predicates = [HasAVX] in { Sched<[itins.Sched.Folded]>; } +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, + ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + itins, UseSSE1, "SS">, XS; + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, + f32mem, ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + itins, HasAVX, "SS">, XS, VEX_4V, VEX_LIG; +} + +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, + sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, itins, UseSSE2, "SD">, XD; + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, + f64mem, sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, itins, HasAVX, "SD">, XD, VEX_4V, VEX_LIG; +} + // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, - SSE_SQRTSS>, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTSD>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. 
Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; -defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, int_x86_avx_rcp_ps_256, SSE_RCPP>; -let Predicates = [UseAVX] in { - def : Pat<(f32 (fsqrt FR32:$src)), - (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - def : Pat<(f64 (fsqrt FR64:$src)), - (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - - def : Pat<(f32 (X86frsqrt FR32:$src)), - (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - - def : Pat<(f32 (X86frcp FR32:$src)), - (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; -} -let Predicates = [UseAVX] in { - def : Pat<(int_x86_sse_sqrt_ss VR128:$src), - (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), - (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR64)), - VR128)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; -} - -let Predicates = [HasAVX] in { - def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), - (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src), - (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse_rcp_ss VR128:$src), - (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src), - (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; -} - -// Reciprocal approximations. Note that these typically require refinement -// in order to obtain suitable precision. -let Predicates = [UseSSE1] in { - def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), - (RSQRTSSr_Int VR128:$src, VR128:$src)>; - def : Pat<(int_x86_sse_rcp_ss VR128:$src), - (RCPSSr_Int VR128:$src, VR128:$src)>; -} - // There is no f64 version of the reciprocal approximation instructions. //===----------------------------------------------------------------------===// @@ -3974,14 +3713,14 @@ let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], - IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>; + IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>; } let SchedRW = [WriteNop] in { // Pause. 
This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. -def PAUSE : I<0x90, RawFrm, (outs), (ins), - "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, +def PAUSE : I<0x90, RawFrm, (outs), (ins), + "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS, Requires<[HasSSE2]>; } @@ -3989,7 +3728,7 @@ let SchedRW = [WriteFence] in { // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, - TB, Requires<[HasSSE1]>; + PS, Requires<[HasSSE1]>; def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, TB, Requires<[HasSSE2]>; @@ -4013,12 +3752,14 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; -def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>; -def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>, Sched<[WriteStore]>; +let Predicates = [UseSSE1] in { +def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; +def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; +} //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -4026,7 +3767,7 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let neverHasSideEffects = 1, SchedRW = [WriteMove] in { +let hasSideEffects = 0, SchedRW = [WriteMove] in { def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, VEX; @@ -4061,7 +3802,7 @@ def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + hasSideEffects = 0, SchedRW = [WriteLoad] in { def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, VEX; @@ -4078,7 +3819,7 @@ let Predicates = [HasAVX] in { } } -let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, @@ -4098,7 +3839,7 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), } let SchedRW = [WriteMove] in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -4119,7 +3860,7 @@ def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), } // SchedRW let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + hasSideEffects = 0, SchedRW = [WriteLoad] in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (alignedloadv2i64 
addr:$src))*/], @@ -4131,7 +3872,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), XS, Requires<[UseSSE2]>; } -let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], @@ -4211,7 +3952,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, RegisterClass RC, ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, - ShiftOpndItins itins, + PatFrag ld_frag, ShiftOpndItins itins, bit Is2Addr = 1> { // src2 is always 128-bit def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), @@ -4227,10 +3968,10 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode RC:$src1, - (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>, + (bc_frag (ld_frag addr:$src2)))))], itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), @@ -4338,45 +4079,45 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, let Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { // 128-bit logical shifts. 
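PSLLDQ/PSRLDQ shift the entire 128-bit register by an immediate byte count; the defs below now select them from the X86vshldq/X86vshrdq nodes in place of the *_dq_bs intrinsic patterns they replace in this hunk. A brief C sketch of the corresponding source-level operations (illustration only):

#include <immintrin.h>

/* Whole-register byte shifts: pslldq shifts toward the high end and fills
   the low bytes with zero, psrldq shifts toward the low end and fills the
   high bytes with zero. */
__m128i byte_shifts(__m128i v) {
  __m128i left  = _mm_slli_si128(v, 4);   /* pslldq $4 */
  __m128i right = _mm_srli_si128(v, 4);   /* psrldq $4 */
  return _mm_or_si128(left, right);
}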
def VPSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; def VPSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } @@ -4384,45 +4125,45 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { let Predicates = [HasAVX2] in { defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR256, v4i64, v2i64, bc_v2i64, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR256, v4i64, v2i64, bc_v2i64, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, - (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, - (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>, + (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; def VPSRLDQYri : PDIi8<0x73, MRM3r, - (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, - (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>, + (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. 
} @@ -4430,85 +4171,58 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 128-bit logical shifts. def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; // PSRADQri doesn't exist in SSE[1-3]. } } // Constraints = "$src1 = $dst" let Predicates = [HasAVX] in { - def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. 
- def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; -} - -let Predicates = [HasAVX2] in { - def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2), - (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), - (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; } let Predicates = [UseSSE2] in { - def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. - def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } //===---------------------------------------------------------------------===// @@ -4537,14 +4251,14 @@ multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, SDNode OpNode> { let Predicates = [HasAVX] in { def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, @@ -4555,14 +4269,14 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src1, i8imm:$src2), + (ins i256mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -4573,14 +4287,14 @@ let Predicates = [HasAVX2] in { let Predicates = [UseSSE2] in { def ri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; def mi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, @@ -4616,7 +4330,7 @@ let Predicates = [UseSSE2] in { let ExeDomain = SSEPackedInt in { multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4634,7 +4348,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, 
$src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OutVT (OpNode VR128:$src1, - (bc_frag (memopv2i64 addr:$src2)))))]>, + (bc_frag (ld_frag addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4653,13 +4367,13 @@ multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OutVT (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2)))))]>, + (bc_frag (loadv4i64 addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4677,7 +4391,7 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OutVT (OpNode VR128:$src1, - (bc_frag (memopv2i64 addr:$src2)))))]>, + (bc_frag (ld_frag addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4696,20 +4410,20 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OutVT (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2)))))]>, + (bc_frag (loadv4i64 addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -4726,16 +4440,16 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, - bc_v8i16>; + bc_v8i16, memopv2i64>; let Predicates = [HasSSE41] in defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, - bc_v4i32>; + bc_v4i32, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -4745,7 +4459,8 @@ let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedInt in { multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> { + SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4759,8 +4474,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OpNode VR128:$src1, - (bc_frag (memopv2i64 - addr:$src2))))], + (bc_frag (ld_frag addr:$src2))))], IIC_SSE_UNPCK>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4776,28 +4490,28 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set 
VR256:$dst, (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2))))]>, + (bc_frag (loadv4i64 addr:$src2))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, - bc_v16i8, 0>, VEX_4V; + bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, 0>, VEX_4V; + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, - bc_v16i8, 0>, VEX_4V; + bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, - bc_v2i64, 0>, VEX_4V; + bc_v2i64, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -4822,22 +4536,22 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, - bc_v16i8>; + bc_v16i8, memopv2i64>; defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, - bc_v2i64>; + bc_v2i64, memopv2i64>; defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, - bc_v16i8>; + bc_v16i8, memopv2i64>; defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, - bc_v2i64>; + bc_v2i64, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -4849,7 +4563,7 @@ let ExeDomain = SSEPackedInt in { multiclass sse2_pinsrw<bit Is2Addr = 1> { def rri : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, - GR32orGR64:$src2, i32i8imm:$src3), + GR32orGR64:$src2, u8imm:$src3), !if(Is2Addr, "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -4858,7 +4572,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; def rmi : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, - i16mem:$src2, i32i8imm:$src3), + i16mem:$src2, u8imm:$src3), !if(Is2Addr, "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -4871,13 +4585,13 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { // Extract let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, PD, VEX, Sched<[WriteShuffle]>; def PEXTRWri : PDIi8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, 
$src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))], IIC_SSE_PEXTRW>, @@ -4974,6 +4688,10 @@ def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4995,6 +4713,10 @@ def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", @@ -5081,6 +4803,15 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), IIC_SSE_MOVD_ToGP>; } //SchedRW +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst), + (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 // @@ -5213,7 +4944,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", // Move Quadword Int to Packed Quadword Int // -let SchedRW = [WriteLoad] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in { def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -5225,12 +4956,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (scalar_to_vector (loadi64 addr:$src))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix -} // SchedRW +} // ExeDomain, SchedRW //===---------------------------------------------------------------------===// // Move Packed Quadword Int to Quadword Int // -let SchedRW = [WriteStore] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -5241,7 +4972,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; -} // SchedRW +} // ExeDomain, SchedRW // For disassembler only let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, @@ -5262,7 +4993,7 @@ let Predicates = [UseSSE2] in def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), (MOVPQI2QImr addr:$dst, VR128:$src)>; -let isCodeGenOnly = 1, AddedComplexity = 20 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 
20 in { def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -5278,7 +5009,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; -} +} // ExeDomain, isCodeGenOnly, AddedComplexity let Predicates = [UseAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), @@ -5304,7 +5035,7 @@ def : Pat<(v4i64 (X86vzload addr:$src)), // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. // -let SchedRW = [WriteVecLogic] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -5317,9 +5048,9 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, XS, Requires<[UseSSE2]>; -} // SchedRW +} // ExeDomain, SchedRW -let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -5335,7 +5066,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; } -} // isCodeGenOnly, SchedRW +} // ExeDomain, isCodeGenOnly, SchedRW let AddedComplexity = 20 in { let Predicates = [UseAVX] in { @@ -5414,10 +5145,10 @@ let Predicates = [UseSSE3] in { //===---------------------------------------------------------------------===// multiclass sse3_replicate_dfp<string OpcodeStr> { -let neverHasSideEffects = 1 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; + [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, @@ -5514,7 +5245,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, OpndItins itins, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : I<0xD0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -5527,62 +5258,62 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>, + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V; + f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L; } let ExeDomain = 
SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V; + f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem, SSE_ALU_F32P>, XD; + f128mem, SSE_ALU_F32P, memopv4f32>, XD; let ExeDomain = SSEPackedDouble in defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, - f128mem, SSE_ALU_F64P>, PD; + f128mem, SSE_ALU_F64P, memopv2f64>, PD; } // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } @@ -5592,7 +5323,8 @@ let Predicates = [UseSSE3] in { // Horizontal ops multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -5604,11 +5336,12 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], IIC_SSE_HADDSUB_RM>, 
Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -5620,41 +5353,45 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, loadv4f32, 0>, VEX_4V; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, loadv4f32, 0>, VEX_4V; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, 0>, VEX_4V, VEX_L; + X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, 0>, VEX_4V, VEX_L; + X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, loadv2f64, 0>, VEX_4V; defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, loadv2f64, 0>, VEX_4V; defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, 0>, VEX_4V, VEX_L; + X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L; defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, 0>, VEX_4V, VEX_L; + X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in { - defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; - defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, + memopv4f32>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, + memopv4f32>; } let ExeDomain = SSEPackedDouble in { - defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; - defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, + memopv2f64>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, + memopv2f64>; } } @@ -5664,8 +5401,8 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
-multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128> { +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5677,7 +5414,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, + (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>; } @@ -5695,7 +5432,7 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (memopv4i64 addr:$src))))]>, + (bitconvert (loadv4i64 addr:$src))))]>, Sched<[WriteVecALULd]>; } @@ -5710,12 +5447,12 @@ def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; let Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", - int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", - int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", - int_x86_ssse3_pabs_d_128>, VEX; + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128, + loadv2i64>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128, + loadv2i64>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128, + loadv2i64>, VEX; def : Pat<(xor (bc_v2i64 (v16i1sextv16i8)), @@ -5753,12 +5490,12 @@ let Predicates = [HasAVX2] in { (VPABSDrr256 VR256:$src)>; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", - int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", - int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", - int_x86_ssse3_pabs_d_128>; +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128, + memopv2i64>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128, + memopv2i64>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128, + memopv2i64>; let Predicates = [HasSSSE3] in { def : Pat<(xor @@ -5830,7 +5567,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, OpndItins itins, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -5846,7 +5583,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>, + (bitconvert (ld_frag addr:$src2))))]>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -5895,17 +5632,17 @@ let isCommutable = 0 in { SSE_PSHUFB, 0>, VEX_4V; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SSE_PHADDSUBSW, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SSE_PHADDSUBSW, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", int_x86_ssse3_pmadd_ub_sw_128, - SSE_PMADD, 0>, VEX_4V; + SSE_PMADD, loadv2i64, 0>, VEX_4V; } defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW, 0>, VEX_4V; + SSE_PMULHRSW, loadv2i64, 0>, VEX_4V; } let ImmT = NoImm, Predicates = [HasAVX2] in { @@ -5970,16 +5707,17 @@ let isCommutable = 0 in { memopv2i64, i128mem, SSE_PSHUFB>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, - SSE_PHADDSUBSW>; + SSE_PHADDSUBSW, memopv2i64>; defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, - SSE_PHADDSUBSW>; + SSE_PHADDSUBSW, memopv2i64>; defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>; + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, memopv2i64>; } defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW>; + SSE_PMULHRSW, memopv2i64>; } //===---------------------------------------------------------------------===// @@ -5987,9 +5725,9 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", //===---------------------------------------------------------------------===// multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -5997,7 +5735,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6007,15 +5745,15 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { } multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2, i8imm:$src3), + (ins VR256:$src1, i256mem:$src2, u8imm:$src3), 
!strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteShuffleLd, ReadAfterLd]>; @@ -6094,552 +5832,271 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, // SSE4.1 - Packed Move with Sign/Zero Extend //===----------------------------------------------------------------------===// -multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], - itins.rm>, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId (load addr:$src)))]>, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", - int_x86_sse41_pmovsxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", - int_x86_sse41_pmovsxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", - int_x86_sse41_pmovsxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", - int_x86_sse41_pmovzxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", - int_x86_sse41_pmovzxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", - int_x86_sse41_pmovzxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let Predicates = [HasAVX2] in { -defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", - int_x86_avx2_pmovsxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", - int_x86_avx2_pmovsxwd, - WriteShuffle>, VEX, VEX_L; -defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", - int_x86_avx2_pmovsxdq, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", - int_x86_avx2_pmovzxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", - int_x86_avx2_pmovzxwd, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", - int_x86_avx2_pmovzxdq, - WriteShuffle>, VEX, VEX_L; -} - -defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, - 
SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load. - def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVZXDQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // Common patterns involving scalar load. 
- def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVZXDQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), +multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + OpndItins itins> { + def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, + [], itins.rr>, Sched<[itins.Sched]>; - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], - itins.rm>, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, - (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let 
Predicates = [HasAVX2] in { -defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", - int_x86_avx2_pmovsxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", - int_x86_avx2_pmovsxwq, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", - int_x86_avx2_pmovzxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", - int_x86_avx2_pmovzxwq, WriteShuffle>, - VEX, VEX_L; -} - -defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq, - SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (VPMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (VPMOVZXWQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (PMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (PMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (PMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (PMOVZXWQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId, - X86FoldableSchedWrite Sched> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), + def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId (bitconvert - (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, - Sched<[Sched.Folded]>; + [], + itins.rm>, Sched<[itins.Sched.Folded]>; } -multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. 
- def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId (bitconvert - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>, VEX; -defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>, VEX; -} -let Predicates = [HasAVX2] in { -defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq, - WriteShuffle>, VEX, VEX_L; +multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + OpndItins SSEItins, OpndItins AVXItins, + OpndItins AVX2Itins> { + defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; + let Predicates = [HasAVX] in + defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, + VR128, VR128, AVXItins>, VEX; + let Predicates = [HasAVX2] in + defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, + VR256, VR128, AVX2Itins>, VEX, VEX_L; +} + +multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; + defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), + !strconcat("pmovzx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { + // Register-Register patterns + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + + // On AVX2, we also support 256bit inputs. + // FIXME: remove these patterns when the old shuffle lowering goes away. 
+ def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), + (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + // Simple Register-Memory patterns + def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + + // AVX2 Register-Memory patterns + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 
(loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; } -defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>; -defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>; let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), - (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXWDYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXDQYrm addr:$src)>; - - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQYrm addr:$src)>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +} + +// SSE4.1/AVX patterns. 
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, PatFrag ExtLoad16> { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + + def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) 
addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; } let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; + defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; } let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; - - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXBWrm addr:$src)>; -} - -let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext 
(v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))), - (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; -} - -let Predicates = [HasAVX] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (VPMOVZXDQrm addr:$src)>; - - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 
addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (PMOVZXDQrm addr:$src)>; + defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; } //===----------------------------------------------------------------------===// @@ -6649,20 +6106,20 @@ let Predicates = [UseSSE41] in { /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, Sched<[WriteShuffle]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i8mem:$dst, 
VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6675,19 +6132,19 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">; multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[WriteShuffle]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i16mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6699,7 +6156,7 @@ defm PEXTRW : SS41I_extract16<0x15, "pextrw">; /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32:$dst, @@ -6707,7 +6164,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { Sched<[WriteShuffle]>; let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i32mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v4i32 VR128:$src1), imm:$src2), @@ -6722,7 +6179,7 @@ defm PEXTRD : SS41I_extract32<0x16, "pextrd">; /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR64:$dst, @@ -6730,7 +6187,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { Sched<[WriteShuffle]>, REX_W; let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i64mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v2i64 VR128:$src1), imm:$src2), @@ -6747,7 +6204,7 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, @@ -6755,7 +6212,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, itins.rr>, Sched<[WriteFBlend]>; let SchedRW = [WriteFBlendLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins f32mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}"), [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), @@ -6786,7 +6243,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6795,7 +6252,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i8mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6812,7 +6269,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR32:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6821,7 +6278,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i32mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6838,7 +6295,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR64:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6847,7 +6304,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i64mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6869,7 +6326,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6878,7 +6335,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, i8imm:$src3), + (ins VR128:$src1, f32mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6932,7 +6389,7 @@ let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg def PSr : SS4AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], @@ -6940,7 +6397,7 @@ let ExeDomain = SSEPackedSingle in { // Vector intrinsic operation, mem def PSm : SS4AIi8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, @@ -6951,7 +6408,7 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, reg def PDr : SS4AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], @@ -6959,7 +6416,7 @@ let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, mem def PDm : SS4AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, @@ -6976,7 +6433,7 @@ let ExeDomain = GenericDomain in { // Operation, reg. let hasSideEffects = 0 in def SSr : SS4AIi8<opcss, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6987,7 +6444,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SSr_Int : SS4AIi8<opcss, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6998,7 +6455,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, mem. def SSm : SS4AIi8<opcss, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7011,7 +6468,7 @@ let ExeDomain = GenericDomain in { // Operation, reg. let hasSideEffects = 0 in def SDr : SS4AIi8<opcsd, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7022,7 +6479,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7033,7 +6490,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, mem. 
def SDm : SS4AIi8<opcsd, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7059,7 +6516,9 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; +} +let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; def : Pat<(f64 (ffloor FR64:$src)), @@ -7080,7 +6539,9 @@ let Predicates = [HasAVX] in { (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; def : Pat<(f64 (ftrunc FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; +} +let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x1))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), @@ -7284,7 +6745,7 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, + Intrinsic IntId128, PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -7295,7 +6756,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId128 (bitconvert (memopv2i64 addr:$src))))]>, + (IntId128 (bitconvert (ld_frag addr:$src))))]>, Sched<[Sched.Folded]>; } @@ -7303,53 +6764,12 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", - int_x86_sse41_phminposuw, + int_x86_sse41_phminposuw, loadv2i64, WriteVecIMul>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw, + int_x86_sse41_phminposuw, memopv2i64, WriteVecIMul>; -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator -multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1, - OpndItins itins = DEFAULT_ITINS> { - let isCommutable = 1 in - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], - itins.rr>, Sched<[itins.Sched]>; - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))], - itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; -} - -/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator -multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId256, - X86FoldableSchedWrite Sched> { - let isCommutable = 1 in - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, - Sched<[Sched]>; - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, - Sched<[Sched.Folded, ReadAfterLd]>; -} - - /// SS48I_binop_rm - Simple SSE41 binary operator. multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, @@ -7398,7 +6818,7 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { let isCommutable = 0 in defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7429,7 +6849,7 @@ let Predicates = [HasAVX] in { SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { let isCommutable = 0 in defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7483,7 +6903,7 @@ let Constraints = "$src1 = $dst" in { SSE_INTMUL_ITINS_P, 1>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, VEX_4V; @@ -7493,10 +6913,10 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, + loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; } @@ -7514,7 +6934,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7523,7 +6943,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7580,13 +7000,13 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { let isCommutable = 0 in { - defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; } + defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -7734,7 +7154,7 @@ let Predicates = [UseAVX] in { def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>; + (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; @@ -7769,7 +7189,7 @@ let Predicates = 
[UseSSE41] in { def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; } @@ -7909,141 +7329,149 @@ let Constraints = "$src1 = $dst" in //===----------------------------------------------------------------------===// // Packed Compare Implicit Length Strings, Return Mask -multiclass pseudo_pcmpistrm<string asm> { +multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, imm:$src3))]>; def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } let Defs = [EFLAGS], usesCustomInserter = 1 in { - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, + Requires<[UseSSE42]>; } multiclass pcmpistrm_SS42AI<string asm> { def rr : SS42AI<0x62, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrM]>; let mayLoad = 1 in def rm :SS42AI<0x62, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; } -let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { +let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; } // Packed Compare Explicit Length Strings, Return Mask -multiclass pseudo_pcmpestrm<string asm> { +multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, - (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>; + (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpestrm<string asm> { def rr : 
SS42AI<0x60, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrM]>; let mayLoad = 1 in def rm : SS42AI<0x60, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; } -let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; } // Packed Compare Implicit Length Strings, Return Index -multiclass pseudo_pcmpistri<string asm> { +multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } let Defs = [EFLAGS], usesCustomInserter = 1 in { - defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>; - defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>; + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpistri<string asm> { def rr : SS42AI<0x63, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrI]>; let mayLoad = 1 in def rm : SS42AI<0x63, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; } -let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { +let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; } // Packed Compare Explicit Length Strings, Return Index -multiclass pseudo_pcmpestri<string asm> { +multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), [(set GR32:$dst, EFLAGS, (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>; - defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>; + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", 
loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpestri<string asm> { def rr : SS42AI<0x61, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrI]>; let mayLoad = 1 in def rm : SS42AI<0x61, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; } -let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; @@ -8123,13 +7551,13 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, (i8 imm:$src3)))]>, TA; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, @@ -8157,8 +7585,8 @@ def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", // AES-NI Instructions //===----------------------------------------------------------------------===// -multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1> { +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag, bit Is2Addr = 1> { def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -8172,31 +7600,31 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, Sched<[WriteAESDecEncLd, ReadAfterLd]>; } // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, 0>, VEX_4V; + int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, 0>, VEX_4V; + int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, 0>, VEX_4V; + int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, 0>, VEX_4V; + int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc>; + int_x86_aesni_aesenc, memopv2i64>; defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast>; + int_x86_aesni_aesenclast, memopv2i64>; defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec>; + 
int_x86_aesni_aesdec, memopv2i64>; defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast>; + int_x86_aesni_aesdeclast, memopv2i64>; } // Perform the AES InvMixColumn Transformation @@ -8227,26 +7655,26 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), // AES Round Key Generation Assist let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, Sched<[WriteAESKeyGen]>, VEX; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGenLd]>, VEX; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, Sched<[WriteAESKeyGen]>; def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, @@ -8257,15 +7685,16 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), //===----------------------------------------------------------------------===// // AVX carry-less Multiplication instructions +let isCommutable = 1 in def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, Sched<[WriteCLMul]>; def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (loadv2i64 addr:$src2), imm:$src3))]>, @@ -8273,15 +7702,16 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), // Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { +let isCommutable = 1 in def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), imm:$src3))], @@ -8320,7 +7750,7 @@ let Predicates = [HasSSE4A] in { let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), - (ins VR128:$src, i8imm:$len, i8imm:$idx), + (ins VR128:$src, u8imm:$len, u8imm:$idx), 
"extrq\t{$idx, $len, $src|$src, $len, $idx}", [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, imm:$idx))]>, PD; @@ -8331,7 +7761,7 @@ def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), VR128:$mask))]>, PD; def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), + (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, VR128:$src2, imm:$len, imm:$idx))]>, XD; @@ -8422,14 +7852,14 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values // -let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR128:$src2, i8imm:$src3), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f128mem:$src2, i8imm:$src3), + (ins VR256:$src1, f128mem:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } @@ -8496,14 +7926,14 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), //===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // -let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteFShuffle]>, VEX, VEX_L; let mayStore = 1 in def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR256:$src1, i8imm:$src2), + (ins f128mem:$dst, VR256:$src1, u8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteStore]>, VEX, VEX_L; } @@ -8624,15 +8054,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, Sched<[WriteFShuffleLd, ReadAfterLd]>; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), - (ins x86memop_f:$src1, i8imm:$src2), + (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8689,13 +8119,13 @@ def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), // let ExeDomain = SSEPackedSingle in { def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$src3))))]>, VEX_4V, 
VEX_L, Sched<[WriteFShuffle]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L, @@ -8756,7 +8186,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (Int VR128:$src))]>, T8PD, VEX, Sched<[WriteCvtF2F]>; - let neverHasSideEffects = 1, mayLoad = 1 in + let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, Sched<[WriteCvtF2FLd]>; @@ -8764,14 +8194,14 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), - (ins RC:$src1, i32i8imm:$src2), + (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in def mr : Ii8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), + (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, TAPD, VEX; } @@ -8814,13 +8244,13 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, @@ -9061,14 +8491,14 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src1, i8imm:$src2), + (ins i256mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -9087,13 +8517,13 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), - 
(ins VR256:$src1, f256mem:$src2, i8imm:$src3), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), (i8 imm:$src3)))]>, @@ -9122,14 +8552,14 @@ def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), //===----------------------------------------------------------------------===// // VINSERTI128 - Insert packed integer values // -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR128:$src2, i8imm:$src3), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i128mem:$src2, i8imm:$src3), + (ins VR256:$src1, i128mem:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } @@ -9177,14 +8607,14 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), // VEXTRACTI128 - Extract packed integer values // def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; -let neverHasSideEffects = 1, mayStore = 1 in +let hasSideEffects = 0, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR256:$src1, i8imm:$src2), + (ins i128mem:$dst, VR256:$src1, u8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteStore]>, VEX, VEX_L; @@ -9260,6 +8690,115 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), + (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), + (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: 
Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), + (bc_v4f32 (v4i32 immAllZerosV)))), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), + (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), + (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (v2f64 immAllZerosV))), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (bc_v2i64 (v4i32 immAllZerosV)))), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr), + VR128:$mask)>; //===----------------------------------------------------------------------===// // Variable Bit Shifts diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index d0bb523..c706d43 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -49,6 +49,7 @@ def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; +} // isConvertibleToThreeAddress = 1 // NOTE: We don't include patterns for shifts of a register by one, because // 
'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). @@ -62,7 +63,6 @@ def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t$dst", [], IIC_SR>; } // hasSideEffects = 0 -} // isConvertibleToThreeAddress = 1 } // Constraints = "$src = $dst", SchedRW @@ -289,11 +289,11 @@ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), "sar{w}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), "sar{l}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; @@ -347,7 +347,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - + def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), @@ -381,7 +381,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - + def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), @@ -397,7 +397,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; - + def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), @@ -493,7 +493,7 @@ def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], @@ -600,7 +600,7 @@ def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], @@ -635,11 +635,11 @@ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), +def ROR64mCL : 
RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; @@ -688,19 +688,19 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { -def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize16; -def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize16; -def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], @@ -710,58 +710,58 @@ def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB, OpSize32; -def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], - IIC_SHD64_REG_CL>, + IIC_SHD64_REG_CL>, TB; -def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], - IIC_SHD64_REG_CL>, + IIC_SHD64_REG_CL>, TB; } let isCommutable = 1 in { // These instructions commute to each other. 
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, - (outs GR16:$dst), + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, - (outs GR16:$dst), + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, - (outs GR32:$dst), + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, - (outs GR32:$dst), + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, - (outs GR64:$dst), + (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, - (outs GR64:$dst), + (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, @@ -789,7 +789,7 @@ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; - + def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), @@ -807,7 +807,7 @@ def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; -def SHRD16mri8 : Ii8<0xAC, MRMDestMem, +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, @@ -822,7 +822,7 @@ def SHLD32mri8 : Ii8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; -def SHRD32mri8 : Ii8<0xAC, MRMDestMem, +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, @@ -837,7 +837,7 @@ def SHLD64mri8 : RIi8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; -def SHRD64mri8 : RIi8<0xAC, MRMDestMem, +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, @@ -859,7 +859,7 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ }]>; multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; @@ -872,7 +872,7 @@ let neverHasSideEffects = 1 in { } multiclass bmi_shift<string asm, 
RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4VOp3, Sched<[WriteShift]>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 8cabdd0..0350566 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -38,9 +38,6 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))], IIC_INT3>; } // SchedRW -def : Pat<(debugtrap), - (INT3)>; - // The long form of "int $3" turns into int3 as a size optimization. // FIXME: This doesn't work because InstAlias can't match immediate constants. //def : InstAlias<"int\t$3", (INT3)>; @@ -71,6 +68,10 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, Requires<[In64BitMode]>; } // SchedRW +def : Pat<(debugtrap), + (INT3)>, Requires<[NotPS4]>; +def : Pat<(debugtrap), + (INT (i8 0x41))>, Requires<[IsPS4]>; //===----------------------------------------------------------------------===// // Input/Output Instructions. @@ -207,7 +208,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), let SchedRW = [WriteSystem] in { def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; -def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize16; def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), @@ -215,14 +216,14 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), OpSize16; // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. -def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize32; def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize32; // i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. 
-def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; @@ -240,7 +241,7 @@ def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize32; def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; @@ -260,7 +261,7 @@ def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; - + def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, OpSize16, Requires<[Not64BitMode]>; @@ -347,31 +348,31 @@ def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; - + def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; - + def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; - + def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; - + def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; - + def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; @@ -408,7 +409,7 @@ def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), "sldt{w}\t$dst", [], IIC_SLDT>, TB; def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB; - + // LLDT is not interpreted specially in 64-bit mode because there is no sign // extension. 
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), @@ -437,19 +438,21 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), //===----------------------------------------------------------------------===// // Specialized register support let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; let Defs = [RAX, RDX], Uses = [ECX] in def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, TB; -def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; -def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB; // no m form encodable; use SMSW16m -def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), "smsw{q}\t$dst", [], IIC_SMSW>, TB; // For memory operands, there is only a 16-bit form @@ -485,15 +488,28 @@ let Uses = [RDX, RAX] in { def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), "xsave\t$dst", []>, TB; def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), "xrstor\t$dst", []>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, TB; + "xsaveopt\t$dst", []>, PS; def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; + + def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors\t$dst", []>, TB; + def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; + def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), + "xsavec\t$dst", []>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), + "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; + def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), + "xsaves\t$dst", []>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), + "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; } } // SchedRW @@ -559,7 +575,13 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), //===----------------------------------------------------------------------===// // SMAP Instruction -let Predicates = [HasSMAP], Defs = [EFLAGS] in { +let Defs = [EFLAGS] in { def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; } + +//===----------------------------------------------------------------------===// +// SMX Instruction +let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; +} diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index 4940efc..7267d75 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -23,9 +23,12 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), 
(ins), "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, Requires<[HasRTM]>; -let isBranch = 1, isTerminator = 1, Defs = [EAX] in -def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst), - "xbegin\t$dst", []>, Requires<[HasRTM]>; +let isBranch = 1, isTerminator = 1, Defs = [EAX] in { +def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), + "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>; +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), + "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>; +} def XEND : I<0x01, MRM_D5, (outs), (ins), "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 45e2ff0..8455b8d 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -20,21 +20,23 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; -defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; -defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; -defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; -defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; -defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; -defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; -defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; -defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; -defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; -defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; -defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; -defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; -defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; -defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} // Scalar load 2 addr operand instructions multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, @@ -47,11 +49,6 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, 
Intrinsic Int, [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; } -defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32>; -defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64>; - multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -62,9 +59,6 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; -defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; - multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), @@ -75,8 +69,19 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; } -defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>; -defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>; +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), @@ -87,28 +92,30 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, XOP_4V, VEX_W; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, + (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, XOP_4VOp3; } -defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; -defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; -defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; -defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; -defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; -defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; -defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; -defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; -defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; -defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; -defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; -defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +let ExeDomain = SSEPackedInt in { + defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, 
"vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), @@ -119,16 +126,19 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { (ins i128mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, XOP; + (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; } -defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; -defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; -defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; -defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +let ExeDomain = SSEPackedInt in { + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +} // Instruction where second source can be memory, but third must be register multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -140,48 +150,66 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>, XOP_4V, VEX_I8IMM; } -defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; -defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; -defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; -defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; -defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; -defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; -defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; -defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; -defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; -defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; -defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; -defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm 
VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} // Instruction where second source can be memory, third must be imm8 -multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { + let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>, + (ins VR128:$src1, VR128:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), - imm:$src3))]>, XOP_4V; + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + i8immZExt3:$cc))]>, XOP_4V; + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + let mayLoad = 1 in + def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + } } -defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; -defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; -defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; -defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; -defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; -defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; -defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; -defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; + defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; + defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; + defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; + defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; + defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; +} // Instruction where either second or third source can be memory multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { @@ -197,20 +225,22 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, - (bitconvert (memopv2i64 addr:$src3))))]>, + (bitconvert (loadv2i64 addr:$src3))))]>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4; def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, 
VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>, XOP_4V, VEX_I8IMM; } -defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; -defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), @@ -225,19 +255,20 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, - (bitconvert (memopv4i64 addr:$src3))))]>, + (bitconvert (loadv4i64 addr:$src3))))]>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)), + (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), VR256:$src3))]>, XOP_4V, VEX_I8IMM, VEX_L; } -defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let ExeDomain = SSEPackedInt in + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { @@ -282,8 +313,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, VEX_L; } -defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, - int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>; -defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>; +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index d252f72..e436811 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -20,8 +20,9 @@ enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, - CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM + CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND }; struct IntrinsicData { @@ -51,7 +52,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0), X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), - + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), @@ -60,7 +61,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gather_qpi_512, 
GATHER, X86::VPGATHERQDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, @@ -69,7 +70,55 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), - + + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -78,7 +127,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, @@ -87,7 +136,7 @@ static const IntrinsicData 
IntrinsicsWithChain[] = { X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm), X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), - + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), @@ -97,7 +146,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), - + X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), @@ -122,6 +171,12 @@ static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0), + X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -138,27 +193,79 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), 
X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -171,6 +278,64 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + + X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + + X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), @@ -195,12 +360,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + 
X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), @@ -215,27 +400,118 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 
0), X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_512, FMA_OP_MASK, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_512, FMA_OP_MASK, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_512, FMA_OP_MASK, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_512, FMA_OP_MASK, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + 
X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), @@ -266,6 +542,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), + X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0), X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, X86ISD::SMAX, 
0), X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0), @@ -274,12 +551,27 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), @@ -290,7 +582,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), - X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0) + X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) }; /* diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 4e0d594..6af59d4 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -74,11 +74,11 @@ namespace llvm { X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} void - X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) { + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { + MF = &F; CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *TM.getSubtargetImpl()->getInstrInfo(), - *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(), - MF.getContext())); + *MF->getSubtarget().getInstrInfo(), *MF->getSubtarget().getRegisterInfo(), + MF->getSubtarget(), MF->getContext())); } void 
X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, @@ -100,7 +100,7 @@ namespace llvm { if (InShadow && CurrentShadowSize < RequiredShadowSize) { InShadow = false; EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, - TM.getSubtarget<X86Subtarget>().is64Bit(), STI); + MF->getSubtarget<X86Subtarget>().is64Bit(), STI); } } @@ -112,8 +112,8 @@ namespace llvm { X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) -: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), - MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} + : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), + AsmPrinter(asmprinter) {} MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); @@ -124,7 +124,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); + const DataLayout *DL = TM.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; @@ -390,9 +390,8 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, Inst.addOperand(Seg); } -static unsigned getRetOpcode(const X86Subtarget &Subtarget) -{ - return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; +static unsigned getRetOpcode(const X86Subtarget &Subtarget) { + return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; } void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { @@ -510,6 +509,7 @@ ReSimplify: // inputs modeled as normal uses instead of implicit uses. As such, truncate // off all but the first operand (the callee). FIXME: Change isel. case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: case X86::CALL64r: case X86::CALL64pcrel32: { unsigned Opcode = OutMI.getOpcode(); @@ -546,6 +546,24 @@ ReSimplify: break; } + case X86::DEC16r: + case X86::DEC32r: + case X86::INC16r: + case X86::INC32r: + // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions. + if (!AsmPrinter.getSubtarget().is64Bit()) { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::DEC16r: Opcode = X86::DEC16r_alt; break; + case X86::DEC32r: Opcode = X86::DEC32r_alt; break; + case X86::INC16r: Opcode = X86::INC16r_alt; break; + case X86::INC32r: Opcode = X86::INC32r_alt; break; + } + OutMI.setOpcode(Opcode); + } + break; + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do // this with an ugly goto in case the resultant OR uses EAX and needs the // short form. @@ -559,28 +577,6 @@ ReSimplify: case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; - // The assembler backend wants to see branches in their small form and relax - // them to their large form. The JIT can only handle the large form because - // it does not do relaxation. For now, translate the large form to the - // small one here. 
- case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break; - case X86::JO_4: OutMI.setOpcode(X86::JO_1); break; - case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break; - case X86::JB_4: OutMI.setOpcode(X86::JB_1); break; - case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break; - case X86::JE_4: OutMI.setOpcode(X86::JE_1); break; - case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break; - case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break; - case X86::JA_4: OutMI.setOpcode(X86::JA_1); break; - case X86::JS_4: OutMI.setOpcode(X86::JS_1); break; - case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break; - case X86::JP_4: OutMI.setOpcode(X86::JP_1); break; - case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break; - case X86::JL_4: OutMI.setOpcode(X86::JL_1); break; - case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break; - case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break; - case X86::JG_4: OutMI.setOpcode(X86::JG_1); break; - // Atomic load and store require a separate pseudo-inst because Acquire // implies mayStore and Release implies mayLoad; fix these to regular MOV // instructions here @@ -625,13 +621,13 @@ ReSimplify: // MOV64ao8, MOV64o8a // XCHG16ar, XCHG32ar, XCHG64ar case X86::MOV8mr_NOREX: - case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break; + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break; case X86::MOV8rm_NOREX: - case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break; - case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break; - case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break; - case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; - case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; @@ -808,6 +804,58 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu } // while (NumBytes) } +static void LowerSTATEPOINT(MCStreamer &OS, StackMaps &SM, + const MachineInstr &MI, bool Is64Bit, + const TargetMachine& TM, + const MCSubtargetInfo& STI, + X86MCInstLower &MCInstLowering) { + assert(Is64Bit && "Statepoint currently only supports X86-64"); + + // Lower call target and choose correct opcode + const MachineOperand &call_target = StatepointOpers(&MI).getCallTarget(); + MCOperand call_target_mcop; + unsigned call_opcode; + switch (call_target.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + call_target_mcop = MCInstLowering.LowerSymbolOperand( + call_target, + MCInstLowering.GetSymbolFromOperand(call_target)); + call_opcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // address. You'll fail asserts during load & relocation if this + // symbol is to far away. 
(TODO: support non-relative addressing) + break; + case MachineOperand::MO_Immediate: + call_target_mcop = MCOperand::CreateImm(call_target.getImm()); + call_opcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // immediate. You'll fail asserts during load & relocation if this + // address is to far away. (TODO: support non-relative addressing) + break; + case MachineOperand::MO_Register: + call_target_mcop = MCOperand::CreateReg(call_target.getReg()); + call_opcode = X86::CALL64r; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + // Emit call + MCInst call_inst; + call_inst.setOpcode(call_opcode); + call_inst.addOperand(call_target_mcop); + OS.EmitInstruction(call_inst, STI); + + // Record our statepoint node in the same section used by STACKMAP + // and PATCHPOINT + SM.recordStatepoint(MI); +} + + // Lower a stackmap of the form: // <id>, <shadowBytes>, ... void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { @@ -941,8 +989,7 @@ static std::string getShuffleComment(const MachineOperand &DstOp, void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); - const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -963,8 +1010,14 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { break; } case X86::TAILJMPr: + case X86::TAILJMPm: case X86::TAILJMPd: + case X86::TAILJMPr64: + case X86::TAILJMPm64: case X86::TAILJMPd64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: + case X86::TAILJMPd64_REX: // Lower these as normal, but add some comments. 
OutStreamer.AddComment("TAILCALL"); break; @@ -1030,6 +1083,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addExpr(DotExpr)); return; } + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), TM, + getSubtargetInfo(), MCInstLowering); case TargetOpcode::STACKMAP: return LowerSTACKMAP(*MI); diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp index 568dc22..ac2cdc8 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -8,7 +8,26 @@ //===----------------------------------------------------------------------===// #include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; void X86MachineFunctionInfo::anchor() { } + +void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { + if (!RestoreBasePointerOffset) { + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + for (const MCPhysReg *CSR = + RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF); + unsigned Reg = *CSR; + ++CSR) + { + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + RestoreBasePointerOffset -= SlotSize; + } + } +} + diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index 79a51b3..d598b55 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineValueType.h" #include <vector> @@ -31,6 +32,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// contains stack pointer re-alignment code which requires FP. bool ForceFramePointer; + /// RestoreBasePointerOffset - Non-zero if the function has base pointer + /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a + /// displacement from the frame pointer to a slot where the base pointer + /// is stashed. + signed char RestoreBasePointerOffset; + /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. unsigned CalleeSavedFrameSize; @@ -43,6 +50,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// ReturnAddrIndex - FrameIndex for return slot. int ReturnAddrIndex; + /// \brief FrameIndex for return slot. + int FrameAddrIndex; + /// TailCallReturnAddrDelta - The number of bytes by which return address /// stack slot is moved as the result of tail call optimization. int TailCallReturnAddrDelta; @@ -70,28 +80,22 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackSize; /// NumLocalDynamics - Number of local-dynamic TLS accesses. unsigned NumLocalDynamics; - -public: - /// Describes a register that needs to be forwarded from the prologue to a - /// musttail call. - struct Forward { - Forward(unsigned VReg, MCPhysReg PReg, MVT VT) - : VReg(VReg), PReg(PReg), VT(VT) {} - unsigned VReg; - MCPhysReg PReg; - MVT VT; - }; + /// HasPushSequences - Keeps track of whether this function uses sequences + /// of pushes to pass function parameters. 
+ bool HasPushSequences; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. - std::vector<Forward> ForwardedMustTailRegParms; + SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; public: X86MachineFunctionInfo() : ForceFramePointer(false), + RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), + FrameAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), @@ -100,13 +104,16 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), + RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), + FrameAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), @@ -115,11 +122,19 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + bool getHasPushSequences() const { return HasPushSequences; } + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } + + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } + void setRestoreBasePointer(const MachineFunction *MF); + int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } @@ -129,6 +144,9 @@ public: int getRAIndex() const { return ReturnAddrIndex; } void setRAIndex(int Index) { ReturnAddrIndex = Index; } + int getFAIndex() const { return FrameAddrIndex; } + void setFAIndex(int Index) { FrameAddrIndex = Index; } + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} @@ -156,7 +174,7 @@ public: unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } - std::vector<Forward> &getForwardedMustTailRegParms() { + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } }; diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index adc05b2..143e70b 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -51,7 +51,7 @@ namespace { struct PadShortFunc : public MachineFunctionPass { static char ID; PadShortFunc() : MachineFunctionPass(ID) - , Threshold(4), TM(nullptr), TII(nullptr) {} + , Threshold(4), STI(nullptr), TII(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -79,7 +79,7 @@ namespace { // VisitedBBs - Cache of previously visited BBs. DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; - const TargetMachine *TM; + const X86Subtarget *STI; const TargetInstrInfo *TII; }; @@ -93,19 +93,16 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits. 
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - const AttributeSet &FnAttrs = MF.getFunction()->getAttributes(); - if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize)) { + if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || + MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { return false; } - TM = &MF.getTarget(); - if (!TM->getSubtarget<X86Subtarget>().padShortFunctions()) + STI = &MF.getSubtarget<X86Subtarget>(); + if (!STI->padShortFunctions()) return false; - TII = TM->getSubtargetImpl()->getInstrInfo(); + TII = STI->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); @@ -195,8 +192,7 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, return true; } - CyclesToEnd += TII->getInstrLatency( - TM->getSubtargetImpl()->getInstrItineraryData(), MI); + CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI); } VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index a4a366d..cab7ce8 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -66,21 +66,22 @@ X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) Is64Bit = Subtarget.is64Bit(); IsWin64 = Subtarget.isTargetWin64(); + // Use a callee-saved register as the base pointer. These registers must + // not conflict with any ABI requirements. For example, in 32-bit mode PIC + // requires GOT in the EBX register before function calls via PLT GOT pointer. if (Is64Bit) { SlotSize = 8; - StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? - X86::RSP : X86::ESP; - FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? - X86::RBP : X86::EBP; + bool Use64BitReg = + Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + StackPtr = Use64BitReg ? X86::RSP : X86::ESP; + FramePtr = Use64BitReg ? X86::RBP : X86::EBP; + BasePtr = Use64BitReg ? X86::RBX : X86::EBX; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; + BasePtr = X86::ESI; } - // Use a callee-saved register as the base pointer. These registers must - // not conflict with any ABI requirements. For example, in 32-bit mode PIC - // requires GOT in the EBX register before function calls via PLT GOT pointer. - BasePtr = Is64Bit ? X86::RBX : X86::ESI; } bool @@ -354,7 +355,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, + false); + for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); } @@ -445,10 +448,8 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const Function *F = MF.getFunction(); unsigned StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttribute(Attribute::StackAlignment)); // If we've requested that we force align the stack do so now. 
if (ForceStackAlign) @@ -468,8 +469,6 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); @@ -506,6 +505,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + if (BasePtr == StackPtr) + FIOffset += SPAdj; + // The frame index format for stackmaps and patchpoints is different from the // X86 format. It only has a FI and an offset. if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { @@ -535,6 +537,14 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return TFI->hasFP(MF) ? FramePtr : StackPtr; } +unsigned X86RegisterInfo::getPtrSizedFrameRegister( + const MachineFunction &MF) const { + unsigned FrameReg = getFrameRegister(MF); + if (Subtarget.isTarget64BitILP32()) + FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + return FrameReg; +} + namespace llvm { unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, bool High) { diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index cc0a7b2..406b1fc 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -122,6 +122,7 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; + unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; unsigned getStackRegister() const { return StackPtr; } unsigned getBaseRegister() const { return BasePtr; } // FIXME: Move to FrameInfo diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 311a717..2e735fa 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -263,14 +263,22 @@ def FS : X86Reg<"fs", 4>; def GS : X86Reg<"gs", 5>; // Debug registers -def DR0 : X86Reg<"dr0", 0>; -def DR1 : X86Reg<"dr1", 1>; -def DR2 : X86Reg<"dr2", 2>; -def DR3 : X86Reg<"dr3", 3>; -def DR4 : X86Reg<"dr4", 4>; -def DR5 : X86Reg<"dr5", 5>; -def DR6 : X86Reg<"dr6", 6>; -def DR7 : X86Reg<"dr7", 7>; +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; +def DR8 : X86Reg<"dr8", 8>; +def DR9 : X86Reg<"dr9", 9>; +def DR10 : X86Reg<"dr10", 10>; +def DR11 : X86Reg<"dr11", 11>; +def DR12 : X86Reg<"dr12", 12>; +def DR13 : X86Reg<"dr13", 13>; +def DR14 : X86Reg<"dr14", 14>; +def DR15 : X86Reg<"dr15", 15>; // Control registers def CR0 : X86Reg<"cr0", 0>; @@ -317,7 +325,7 @@ def GR8 : RegisterClass<"X86", [i8], 8, R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { let AltOrders = [(sub GR8, AH, BH, CH, DH)]; let AltOrderSelect = [{ - return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + return MF.getSubtarget<X86Subtarget>().is64Bit(); }]; } @@ -369,7 +377,7 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, AH, CH, DH, BL, BH)> { let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; let AltOrderSelect = [{ - return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + return MF.getSubtarget<X86Subtarget>().is64Bit(); }]; } // GR16_NOREX - GR16 registers which do not require a REX prefix.
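The X86RegisterInfo constructor hunk above reduces to a small decision table: LP64 and NaCl64 keep the full 64-bit RSP/RBP/RBX, the x32 ABI drops to their 32-bit views, and 32-bit mode keeps ESI as the base pointer so EBX stays free for the PIC GOT pointer. A minimal standalone C++ sketch of that table follows (plain C++, not the LLVM API; the Reg enum, PtrRegs struct and pickPointerRegs helper are illustrative names only):

#include <cstdio>

// Illustrative stand-ins for the real X86 register enumerators.
enum Reg { RSP, ESP, RBP, EBP, RBX, EBX, ESI };

struct PtrRegs { unsigned SlotSize; Reg StackPtr, FramePtr, BasePtr; };

// Mirrors the constructor logic in the hunk: 64-bit mode always uses 8-byte
// slots, LP64/NaCl64 use the 64-bit registers, x32 uses the 32-bit views,
// and 32-bit mode picks ESI as the base pointer.
static PtrRegs pickPointerRegs(bool Is64Bit, bool UseLP64OrNaCl64Regs) {
  if (Is64Bit) {
    bool Use64BitReg = UseLP64OrNaCl64Regs;
    return {8, Use64BitReg ? RSP : ESP, Use64BitReg ? RBP : EBP,
            Use64BitReg ? RBX : EBX};
  }
  return {4, ESP, EBP, ESI};
}

int main() {
  PtrRegs X32 = pickPointerRegs(true, false); // x86-64 with the x32 ABI
  std::printf("x32 slot size: %u, base ptr is EBX: %d\n", X32.SlotSize,
              X32.BasePtr == EBX);
  return 0;
}

The new getPtrSizedFrameRegister in the same hunk applies the same idea at query time, narrowing the frame register to its 32-bit view for ILP32 targets.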
@@ -461,18 +469,18 @@ def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} -def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} -def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} -def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} +def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;} +def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;} +def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} -def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} -def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} -def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;} +def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;} +def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;} +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;} def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 73a3230..61c0600 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -1895,7 +1895,7 @@ def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; // x,m / v,v,m. def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 4; + let Latency = 9; let NumMicroOps = 2; let ResourceCycles = [1, 1]; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 821044f..7feabf6 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -57,7 +57,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); #ifndef NDEBUG // If the base register might conflict with our physical registers, bail out. 
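A recurring pattern in this commit, visible in the EmitTargetCodeForMemset hunk just above and spelled out in the X86TargetMachine::getSubtargetImpl hunk further down, is to stop asking the TargetMachine for one global subtarget and instead look one up per function, cached by its target-cpu and target-features attribute strings. A rough standalone sketch of that caching idea (plain C++, not the LLVM API; Subtarget, TargetMachineModel and getSubtargetFor are made-up names):

#include <cstdio>
#include <map>
#include <memory>
#include <string>

// Stand-in for X86Subtarget: just remembers which CPU/feature strings it was
// built for.
struct Subtarget {
  std::string CPU, FS;
  Subtarget(std::string C, std::string F)
      : CPU(std::move(C)), FS(std::move(F)) {}
};

class TargetMachineModel {
  // One subtarget per distinct (cpu, features) key, so two functions compiled
  // with different "target-features" attributes see different subtargets.
  std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

public:
  const Subtarget &getSubtargetFor(const std::string &CPU,
                                   const std::string &FS) {
    std::unique_ptr<Subtarget> &Entry = SubtargetMap[CPU + "\n" + FS];
    if (!Entry)
      Entry.reset(new Subtarget(CPU, FS));
    return *Entry;
  }
};

int main() {
  TargetMachineModel TM;
  const Subtarget &A = TM.getSubtargetFor("haswell", "+avx2");
  const Subtarget &B = TM.getSubtargetFor("haswell", "+avx2");
  std::printf("cached: %d\n", &A == &B); // prints 1: same object reused
}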
@@ -199,17 +200,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, return Chain; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 9d877c9..de30c75 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -257,17 +257,17 @@ void X86Subtarget::initializeEnvironment() { HasVLX = false; HasADX = false; HasSHA = false; - HasSGX = false; HasPRFCHW = false; HasRDSEED = false; - HasSMAP = false; IsBTMemSlow = false; IsSHLDSlow = false; IsUAMemFast = false; - HasVectorUAMem = false; + IsUAMem32Slow = false; + HasSSEUnalignedMem = false; HasCmpxchg16b = false; UseLeaForSP = false; - HasSlowDivide = false; + HasSlowDivide32 = false; + HasSlowDivide64 = false; PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; @@ -280,46 +280,6 @@ void X86Subtarget::initializeEnvironment() { MaxInlineSizeThreshold = 128; } -static std::string computeDataLayout(const Triple &TT) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(TT); - // X86 and x32 have 32 bit pointers. - if ((TT.isArch64Bit() && - (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || - !TT.isArch64Bit()) - Ret += "-p:32:32"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (TT.isOSNaCl()) - ; // No f80 - else if (TT.isArch64Bit() || TT.isOSDarwin()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (TT.isArch64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. 
- if (!TT.isArch64Bit() && TT.isOSWindows()) - Ret += "-S32"; - else - Ret += "-S128"; - - return Ret; -} - X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initializeEnvironment(); @@ -332,16 +292,16 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), - DL(computeDataLayout(TargetTriple)), StackAlignOverride(StackAlignOverride), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown, - getStackAlignment(), is64Bit() ? -8 : -4) { + TSInfo(*TM.getDataLayout()), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), + is64Bit() ? -8 : -4) { // Determine the PICStyle based on the target selected. if (TM.getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 091b6c4..4c31f78 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -31,7 +31,7 @@ class GlobalValue; class StringRef; class TargetMachine; -/// PICStyles - The X86 backend supports a number of different styles of PIC. +/// The X86 backend supports a number of different styles of PIC. /// namespace PICStyles { enum Style { @@ -58,138 +58,136 @@ protected: Others, IntelAtom, IntelSLM }; - /// X86ProcFamily - X86 processor family: Intel Atom, and others + /// X86 processor family: Intel Atom, and others X86ProcFamilyEnum X86ProcFamily; - /// PICStyle - Which PIC style to use - /// + /// Which PIC style to use PICStyles::Style PICStyle; - /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or - /// none supported. + /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported. - /// + /// 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; - /// HasCMov - True if this processor has conditional move instructions + /// True if this processor has conditional move instructions /// (generally pentium pro+). bool HasCMov; - /// HasX86_64 - True if the processor supports X86-64 instructions. - /// + /// True if the processor supports X86-64 instructions. bool HasX86_64; - /// HasPOPCNT - True if the processor supports POPCNT. + /// True if the processor supports POPCNT. bool HasPOPCNT; - /// HasSSE4A - True if the processor supports SSE4A instructions. + /// True if the processor supports SSE4A instructions. 
bool HasSSE4A; - /// HasAES - Target has AES instructions + /// Target has AES instructions bool HasAES; - /// HasPCLMUL - Target has carry-less multiplication + /// Target has carry-less multiplication bool HasPCLMUL; - /// HasFMA - Target has 3-operand fused multiply-add + /// Target has 3-operand fused multiply-add bool HasFMA; - /// HasFMA4 - Target has 4-operand fused multiply-add + /// Target has 4-operand fused multiply-add bool HasFMA4; - /// HasXOP - Target has XOP instructions + /// Target has XOP instructions bool HasXOP; - /// HasTBM - Target has TBM instructions. + /// Target has TBM instructions. bool HasTBM; - /// HasMOVBE - True if the processor has the MOVBE instruction. + /// True if the processor has the MOVBE instruction. bool HasMOVBE; - /// HasRDRAND - True if the processor has the RDRAND instruction. + /// True if the processor has the RDRAND instruction. bool HasRDRAND; - /// HasF16C - Processor has 16-bit floating point conversion instructions. + /// Processor has 16-bit floating point conversion instructions. bool HasF16C; - /// HasFSGSBase - Processor has FS/GS base instructions. + /// Processor has FS/GS base instructions. bool HasFSGSBase; - /// HasLZCNT - Processor has LZCNT instruction. + /// Processor has LZCNT instruction. bool HasLZCNT; - /// HasBMI - Processor has BMI1 instructions. + /// Processor has BMI1 instructions. bool HasBMI; - /// HasBMI2 - Processor has BMI2 instructions. + /// Processor has BMI2 instructions. bool HasBMI2; - /// HasRTM - Processor has RTM instructions. + /// Processor has RTM instructions. bool HasRTM; - /// HasHLE - Processor has HLE. + /// Processor has HLE. bool HasHLE; - /// HasADX - Processor has ADX instructions. + /// Processor has ADX instructions. bool HasADX; - /// HasSHA - Processor has SHA instructions. + /// Processor has SHA instructions. bool HasSHA; - /// HasSGX - Processor has SGX instructions. - bool HasSGX; - - /// HasPRFCHW - Processor has PRFCHW instructions. + /// Processor has PRFCHW instructions. bool HasPRFCHW; - /// HasRDSEED - Processor has RDSEED instructions. + /// Processor has RDSEED instructions. bool HasRDSEED; - /// HasSMAP - Processor has SMAP instructions. - bool HasSMAP; - - /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; - /// IsSHLDSlow - True if SHLD instructions are slow. + /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// IsUAMemFast - True if unaligned memory access is fast. + /// True if unaligned memory access is fast. bool IsUAMemFast; - /// HasVectorUAMem - True if SIMD operations can have unaligned memory - /// operands. This may require setting a feature bit in the processor. - bool HasVectorUAMem; + /// True if unaligned 32-byte memory accesses are slow. + bool IsUAMem32Slow; + + /// True if SSE operations can have unaligned memory operands. + /// This may require setting a configuration bit in the processor. + bool HasSSEUnalignedMem; - /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction; + /// True if this processor has the CMPXCHG16B instruction; /// this is true for most x86-64 chips, but not the first AMD chips. bool HasCmpxchg16b; - /// UseLeaForSP - True if the LEA instruction should be used for adjusting + /// True if the LEA instruction should be used for adjusting /// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP; - /// HasSlowDivide - True if smaller divides are significantly faster than - /// full divides and should be used when possible. - bool HasSlowDivide; + /// True if 8-bit divisions are significantly faster than + /// 32-bit divisions and should be used when possible. + bool HasSlowDivide32; + + /// True if 16-bit divides are significantly faster than + /// 64-bit divisions and should be used when possible. + bool HasSlowDivide64; - /// PadShortFunctions - True if the short functions should be padded to prevent + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; - /// CallRegIndirect - True if the Calls with memory reference should be converted + /// True if the Calls with memory reference should be converted /// to a register-based indirect call. bool CallRegIndirect; - /// LEAUsesAG - True if the LEA instruction inputs have to be ready at - /// address generation (AG) time. + + /// True if the LEA instruction inputs have to be ready at address generation + /// (AG) time. bool LEAUsesAG; - /// SlowLEA - True if the LEA instruction with certain arguments is slow + /// True if the LEA instruction with certain arguments is slow bool SlowLEA; - /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags + /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; /// Use the RSQRT* instructions to optimize square root calculations. @@ -201,7 +199,7 @@ protected: /// For this to be profitable, the cost of FDIV must be /// substantially higher than normal FP ops like FADD and FMUL. bool UseReciprocalEst; - + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -220,7 +218,7 @@ protected: /// Processor has AVX-512 Vector Length eXtensions bool HasVLX; - /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -228,26 +226,24 @@ protected: /// unsigned MaxInlineSizeThreshold; - /// TargetTriple - What processor and OS we're targeting. + /// What processor and OS we're targeting. Triple TargetTriple; /// Instruction itineraries for scheduling InstrItineraryData InstrItins; private: - // Calculates type size & alignment - const DataLayout DL; - /// StackAlignOverride - Override the stack alignment. + /// Override the stack alignment. unsigned StackAlignOverride; - /// In64BitMode - True if compiling for 64-bit, false for 16-bit or 32-bit. + /// True if compiling for 64-bit, false for 16-bit or 32-bit. bool In64BitMode; - /// In32BitMode - True if compiling for 32-bit, false for 16-bit or 64-bit. + /// True if compiling for 32-bit, false for 16-bit or 64-bit. bool In32BitMode; - /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. + /// True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode; X86SelectionDAGInfo TSInfo; @@ -269,7 +265,6 @@ public: return &TLInfo; } const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const X86FrameLowering *getFrameLowering() const override { return &FrameLowering; } @@ -280,12 +275,12 @@ public: return &getInstrInfo()->getRegisterInfo(); } - /// getStackAlignment - Returns the minimum alignment known to hold of the + /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. unsigned getStackAlignment() const { return stackAlignment; } - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } @@ -294,7 +289,7 @@ public: void ParseSubtargetFeatures(StringRef CPU, StringRef FS); private: - /// \brief Initialize the full set of dependencies so we can use an initializer + /// Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); @@ -316,13 +311,13 @@ public: /// Is this x86_64 with the ILP32 programming model (x32 ABI)? bool isTarget64BitILP32() const { return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 || - TargetTriple.getOS() == Triple::NaCl); + TargetTriple.isOSNaCl()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? bool isTarget64BitLP64() const { return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && - TargetTriple.getOS() != Triple::NaCl); + !TargetTriple.isOSNaCl()); } PICStyles::Style getPICStyle() const { return PICStyle; } @@ -363,17 +358,17 @@ public: bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } - bool hasSGX() const { return HasSGX; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } - bool hasSMAP() const { return HasSMAP; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } - bool hasVectorUAMem() const { return HasVectorUAMem; } + bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } + bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } - bool hasSlowDivide() const { return HasSlowDivide; } + bool hasSlowDivide32() const { return HasSlowDivide32; } + bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } @@ -394,16 +389,14 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - bool isTargetFreeBSD() const { - return TargetTriple.getOS() == Triple::FreeBSD; - } - bool isTargetSolaris() const { - return TargetTriple.getOS() == Triple::Solaris; - } + bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } + bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } + bool isTargetSolaris() const { return 
TargetTriple.isOSSolaris(); } + bool isTargetPS4() const { return TargetTriple.isPS4(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } - bool isTargetMacho() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } @@ -469,13 +462,11 @@ public: unsigned char ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM)const; - /// ClassifyBlockAddressReference - Classify a blockaddress reference for the - /// current subtarget according to how we should reference it in a non-pcrel - /// context. + /// Classify a blockaddress reference for the current subtarget according to + /// how we should reference it in a non-pcrel context. unsigned char ClassifyBlockAddressReference() const; - /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls - /// to immediate address. + /// Return true if the subtarget allows calls to immediate address. bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const; /// This function returns the name of a function which has an interface @@ -494,8 +485,7 @@ public: bool enableEarlyIfConversion() const override; - /// getInstrItins = Return the instruction itineraries based on the - /// subtarget selection. + /// Return the instruction itineraries based on the subtarget selection. const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8802feb..4bde053 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -14,9 +14,10 @@ #include "X86TargetMachine.h" #include "X86.h" #include "X86TargetObjectFile.h" +#include "X86TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" @@ -47,6 +48,46 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { llvm_unreachable("unknown subtarget type"); } +static std::string computeDataLayout(const Triple &TT) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(TT); + // X86 and x32 have 32 bit pointers. + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (TT.isOSNaCl()) + ; // No f80 + else if (TT.isArch64Bit() || TT.isOSDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (TT.isArch64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. + if (!TT.isArch64Bit() && TT.isOSWindows()) + Ret += "-S32"; + else + Ret += "-S128"; + + return Ret; +} + /// X86TargetMachine ctor - Create an X86 target. 
/// X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, @@ -55,6 +96,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), + DL(computeDataLayout(Triple(TT))), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // default to hard float ABI if (Options.FloatABIType == FloatABI::Default) @@ -74,11 +116,8 @@ X86TargetMachine::~X86TargetMachine() {} const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { - AttributeSet FnAttrs = F.getAttributes(); - Attribute CPUAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); - Attribute FSAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); std::string CPU = !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString().str() @@ -92,8 +131,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // function before we can generate a subtarget. We also need to use // it as a key for the subtarget since that can be the only difference // between two functions. - Attribute SFAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float"); + Attribute SFAttr = F.getFnAttribute("use-soft-float"); bool SoftFloat = !SFAttr.hasAttribute(Attribute::None) ? SFAttr.getValueAsString() == "true" : Options.UseSoftFloat; @@ -120,15 +158,12 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::init(true)); //===----------------------------------------------------------------------===// -// X86 Analysis Pass Setup +// X86 TTI query. //===----------------------------------------------------------------------===// -void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our X86 pass. This - // allows the X86 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createX86TargetTransformInfoPass(this)); +TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); } @@ -147,16 +182,12 @@ public: return getTM<X86TargetMachine>(); } - const X86Subtarget &getX86Subtarget() const { - return *getX86TargetMachine().getSubtargetImpl(); - } - void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; }; } // namespace @@ -175,7 +206,8 @@ bool X86PassConfig::addInstSelector() { addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); // For ELF, cleanup any local-dynamic TLS accesses. - if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && + getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); addPass(createX86GlobalBaseRegPass()); @@ -188,32 +220,23 @@ bool X86PassConfig::addILPOpts() { return true; } -bool X86PassConfig::addPreRegAlloc() { - return false; // -print-machineinstr shouldn't print after this. 
+void X86PassConfig::addPreRegAlloc() { + addPass(createX86CallFrameOptimization()); } -bool X86PassConfig::addPostRegAlloc() { +void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); - return true; // -print-machineinstr should print after this. } -bool X86PassConfig::addPreEmitPass() { - bool ShouldPrint = false; - if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) { +void X86PassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None) addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); - ShouldPrint = true; - } - if (UseVZeroUpper) { + if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); - ShouldPrint = true; - } if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); - ShouldPrint = true; } - - return ShouldPrint; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 916278c..283858d 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -24,22 +24,22 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - X86Subtarget Subtarget; + // Calculates type size & alignment + const DataLayout DL; + X86Subtarget Subtarget; mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; public: - X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); ~X86TargetMachine() override; - + const DataLayout *getDataLayout() const override { return &DL; } const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86Subtarget *getSubtargetImpl(const Function &F) const override; - /// \brief Register X86 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; // Set up the pass pipeline. TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index f8bcd61..1d1c32e 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -21,6 +21,11 @@ using namespace llvm; using namespace dwarf; +X86_64MachoTargetObjectFile::X86_64MachoTargetObjectFile() + : TargetLoweringObjectFileMachO() { + SupportIndirectSymViaGOTPCRel = true; +} + const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI, @@ -46,6 +51,17 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( return TM.getSymbol(GV, Mang); } +const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, int64_t Offset) const { + // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry + // from a data section. In case there's an additional offset, then use + // foo@GOTPCREL+4+<offset>. 
+ const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Off = MCConstantExpr::Create(Offset+4, getContext()); + return MCBinaryExpr::CreateAdd(Res, Off, getContext()); +} + void X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 6a6988a..f745538 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -19,6 +19,8 @@ namespace llvm { /// x86-64. class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: + X86_64MachoTargetObjectFile(); + const MCExpr * getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, @@ -30,6 +32,10 @@ namespace llvm { MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI) const override; + + const MCExpr * + getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + int64_t Offset) const override; }; /// X86LinuxTargetObjectFile - This implementation is used for linux x86 diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 2b70fd0..5136619 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -14,9 +14,9 @@ /// //===----------------------------------------------------------------------===// -#include "X86.h" -#include "X86TargetMachine.h" +#include "X86TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" @@ -25,123 +25,22 @@ using namespace llvm; #define DEBUG_TYPE "x86tti" -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeX86TTIPass(PassRegistry &); -} - -namespace { - -class X86TTI final : public ImmutablePass, public TargetTransformInfo { - const X86Subtarget *ST; - const X86TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), ST(TM->getSubtargetImpl()), - TLI(TM->getSubtargetImpl()->getTargetLowering()) { - initializeX86TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { - pushTTIStack(this); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo*)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override; - unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaxInterleaveFactor() const override; - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind, OperandValueProperties, - OperandValueProperties) const override; - unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, - int Index, Type *SubTp) const override; - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const override; - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const override; - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const override; - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - - unsigned getAddressComputationCost(Type *PtrTy, - bool IsComplex) const override; - - unsigned getReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm) const override; - - unsigned getIntImmCost(int64_t) const; - - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", - "X86 Target Transform Info", true, true, false) -char X86TTI::ID = 0; - -ImmutablePass * -llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { - return new X86TTI(TM); -} - - //===----------------------------------------------------------------------===// // // X86 cost model. // //===----------------------------------------------------------------------===// -X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { +TargetTransformInfo::PopcntSupportKind +X86TTIImpl::getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TODO: Currently the __builtin_popcount() implementation using SSE3 // instructions is inefficient. Once the problem is fixed, we should // call ST->hasSSE3() instead of ST->hasPOPCNT(). - return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software; + return ST->hasPOPCNT() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; } -unsigned X86TTI::getNumberOfRegisters(bool Vector) const { +unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasSSE1()) return 0; @@ -153,7 +52,7 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { return 8; } -unsigned X86TTI::getRegisterBitWidth(bool Vector) const { +unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { if (ST->hasAVX512()) return 512; if (ST->hasAVX()) return 256; @@ -167,7 +66,7 @@ unsigned X86TTI::getRegisterBitWidth(bool Vector) const { } -unsigned X86TTI::getMaxInterleaveFactor() const { +unsigned X86TTIImpl::getMaxInterleaveFactor() { if (ST->isAtom()) return 1; @@ -179,10 +78,10 @@ unsigned X86TTI::getMaxInterleaveFactor() const { return 2; } -unsigned X86TTI::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, OperandValueKind Op1Info, - OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { +unsigned X86TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); @@ -352,7 +251,7 @@ unsigned X86TTI::getArithmeticInstrCost( { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized. { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized. @@ -437,17 +336,16 @@ unsigned X86TTI::getArithmeticInstrCost( return LT.first * 6; // Fallback to the default implementation. - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, - Op2Info); + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) const { +unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. - if (Kind != SK_Reverse && Kind != SK_Alternate) - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - if (Kind == SK_Reverse) { + if (Kind == TTI::SK_Reverse) { std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); unsigned Cost = 1; if (LT.second.getSizeInBits() > 128) @@ -457,7 +355,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, return Cost * LT.first; } - if (Kind == SK_Alternate) { + if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. 
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); @@ -525,7 +423,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - + if (ST->hasSSSE3()) { int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx != -1) @@ -538,7 +436,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd - + // This is expanded into a long sequence of four extract + four insert. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. @@ -546,17 +444,17 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} }; - // Fall-back (SSE3 and SSE2). + // Fall-back (SSE3 and SSE2). int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx != -1) return LT.first * SSEAltShuffleTbl[Idx].Cost; - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { +unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -638,7 +536,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); static const TypeConversionCostTblEntry<MVT::SimpleValueType> AVX2ConversionTbl[] = { @@ -757,11 +655,11 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { return AVXConversionTbl[Idx].Cost; } - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { +unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { // Legalize the type. 
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); @@ -827,11 +725,11 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return LT.first * SSE42CostTbl[Idx].Cost; } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { +unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -851,26 +749,27 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return 0; } - return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); + return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) const { +unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); unsigned Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) - Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); if (Extract) - Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); } return Cost; } -unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const { +unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -888,10 +787,8 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, // Assume that all other non-power-of-two numbers are scalarized. 
    if (!isPowerOf2_32(NumElem)) {
-      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
-                                                           VTy->getScalarType(),
-                                                           Alignment,
-                                                           AddressSpace);
+      unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(),
+                                             Alignment, AddressSpace);
      unsigned SplitCost = getScalarizationOverhead(Src,
                                                    Opcode == Instruction::Load,
                                                    Opcode==Instruction::Store);
@@ -915,7 +812,60 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
  return Cost;
}

-unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+  if (!SrcVTy)
+    // To calculate scalar take the regular cost, without mask
+    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+  unsigned NumElem = SrcVTy->getVectorNumElements();
+  VectorType *MaskTy =
+      VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
+  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
+      !isPowerOf2_32(NumElem)) {
+    // Scalarization
+    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+    unsigned ScalarCompareCost =
+        getCmpSelInstrCost(Instruction::ICmp,
+                           Type::getInt8Ty(getGlobalContext()), NULL);
+    unsigned BranchCost = getCFInstrCost(Instruction::Br);
+    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+    unsigned ValueSplitCost =
+        getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
+                                 Opcode == Instruction::Store);
+    unsigned MemopCost =
+        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                         Alignment, AddressSpace);
+    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+  }
+
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+  unsigned Cost = 0;
+  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+      LT.second.getVectorNumElements() == NumElem)
+    // Promotion requires expand/truncate for data and a shuffle for mask.
+    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
+            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);
+
+  else if (LT.second.getVectorNumElements() > NumElem) {
+    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+                                            LT.second.getVectorNumElements());
+    // Expanding requires fill mask with zeroes
+    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+  }
+  if (!ST->hasAVX512())
+    return Cost + LT.first*4; // Each maskmov costs 4
+
+  // AVX-512 masked load/store is cheaper
+  return Cost+LT.first;
+}
+
+unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
@@ -925,22 +875,22 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

-  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+  return BaseT::getAddressComputationCost(Ty, IsComplex);
}

-unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
-                                  bool IsPairwise) const {
-
+unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+                                      bool IsPairwise) {
+
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
-
+
  MVT MTy = LT.second;
-
+
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
-
-  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
-  // and make it as the cost.
-
+
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
+  // and make it as the cost.
+
  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
@@ -948,7 +898,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD, MVT::v8i16, 5 },
  };
-
+
  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::FADD, MVT::v4f64, 5 },
@@ -967,7 +917,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
  };
-
+
  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v4f64, 3 },
@@ -978,14 +928,14 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
    { ISD::ADD, MVT::v8i16, 4 },
    { ISD::ADD, MVT::v8i32, 5 },
  };
-
+
  if (IsPairwise) {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblPairWise[Idx].Cost;
    }
-
+
    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
      if (Idx != -1)
@@ -997,7 +947,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
      if (Idx != -1)
        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
    }
-
+
    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
@@ -1005,23 +955,23 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
    }
  }

-  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-unsigned X86TTI::getIntImmCost(int64_t Val) const {
+unsigned X86TTIImpl::getIntImmCost(int64_t Val) {
  if (Val == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;

  if (isInt<32>(Val))
-    return TCC_Basic;
+    return TTI::TCC_Basic;

-  return 2 * TCC_Basic;
+  return 2 * TTI::TCC_Basic;
}

-unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -1033,10 +983,10 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {

  // Fixme: Create a cost model for types larger than i128 once the codegen
  // issues have been fixed.
  if (BitSize > 128)
-    return TCC_Free;
+    return TTI::TCC_Free;

  if (Imm == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
@@ -1055,26 +1005,27 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  return std::max(1U, Cost);
}

-unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                               Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();

  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
-      return 2 * TCC_Basic;
-    return TCC_Free;
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
@@ -1096,7 +1047,7 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
-      return TCC_Free;
+      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
@@ -1114,27 +1065,28 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
-    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
-    return (Cost <= NumConstants * TCC_Basic)
-               ? static_cast<unsigned>(TCC_Free)
-               : Cost;
+    unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<unsigned>(TTI::TCC_Free)
+               : Cost;
  }

-  return X86TTI::getIntImmCost(Imm, Ty);
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
}

-unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                               const APInt &Imm, Type *Ty) const {
+unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();

  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;

  switch (IID) {
-  default: return TCC_Free;
+  default:
+    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
@@ -1142,17 +1094,33 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
-      return TCC_Free;
+      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
    break;
  }
-  return X86TTI::getIntImmCost(Imm, Ty);
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
+  int DataWidth = DataTy->getPrimitiveSizeInBits();
+
+  // Todo: AVX512 allows gather/scatter, works with strided and random as well
+  if ((DataWidth < 32) || (Consecutive == 0))
+    return false;
+  if (ST->hasAVX512() || ST->hasAVX2())
+    return true;
+  return false;
}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) {
+  return isLegalMaskedLoad(DataType, Consecutive);
+}
+
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 0000000..9f0adcf
--- /dev/null
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,112 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides a TargetTransformInfo::Concept conforming object specific
+/// to the X86 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+  typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const X86Subtarget *getST() const { return ST; }
+  const X86TargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit X86TTIImpl(const X86TargetMachine *TM, Function &F)
+      : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+
+  // Provide value semantics. MSVC requires that we spell all of these out.
+  X86TTIImpl(const X86TTIImpl &Arg)
+      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+  X86TTIImpl(X86TTIImpl &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+        TLI(std::move(Arg.TLI)) {}
+  X86TTIImpl &operator=(const X86TTIImpl &RHS) {
+    BaseT::operator=(static_cast<const BaseT &>(RHS));
+    ST = RHS.ST;
+    TLI = RHS.TLI;
+    return *this;
+  }
+  X86TTIImpl &operator=(X86TTIImpl &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    ST = std::move(RHS.ST);
+    TLI = std::move(RHS.TLI);
+    return *this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor();
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                          Type *SubTp);
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace);
+  unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+  unsigned getIntImmCost(int64_t);
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty);
+  bool isLegalMaskedLoad(Type *DataType, int Consecutive);
+  bool isLegalMaskedStore(Type *DataType, int Consecutive);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d93baeb..99ba4c0 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -9,7 +9,7 @@
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids transition latency
-// penalty when tranfering control between AVX encoded instructions and old
+// penalty when transferring control between AVX encoded instructions and old
// SSE encoding mode.
//
//===----------------------------------------------------------------------===//
@@ -171,7 +171,7 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
}

/// processBasicBlock - Loop over all of the instructions in the basic block,
-/// inserting vzero upper instructions before function calls.
+/// inserting vzeroupper instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {

  // Start by assuming that the block PASS_THROUGH, which implies no unguarded
@@ -202,7 +202,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
    // If the call won't clobber any YMM register, skip it as well. It usually
    // happens on helper function calls (such as '_chkstk', '_ftol2') where
    // standard calling convention is not used (RegMask is not used to mark
-    // register clobbered and register usage (def/imp-def/use) is well-dfined
+    // register clobbered and register usage (def/imp-def/use) is well-defined
    // and explicitly specified.
    if (MI->isCall() && !callClobbersAnyYmmReg(MI))
      continue;
@@ -245,25 +245,29 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
}

/// runOnMachineFunction - Loop over all of the basic blocks, inserting
-/// vzero upper instructions before function calls.
+/// vzeroupper instructions before function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
-  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  if (!ST.hasAVX() || ST.hasAVX512())
    return false;
-  TII = MF.getSubtarget().getInstrInfo();
+  TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  EverMadeChange = false;

+  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
  // Fast check: if the function doesn't use any ymm registers, we don't need
  // to insert any VZEROUPPER instructions. This is constant-time, so it is
  // cheap in the common case of no ymm use.
-  bool YMMUsed = false;
-  const TargetRegisterClass *RC = &X86::VR256RegClass;
-  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
-       i != e; i++) {
-    if (!MRI.reg_nodbg_empty(*i)) {
-      YMMUsed = true;
-      break;
+  bool YMMUsed = FnHasLiveInYmm;
+  if (!YMMUsed) {
+    const TargetRegisterClass *RC = &X86::VR256RegClass;
+    for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+         i++) {
+      if (!MRI.reg_nodbg_empty(*i)) {
+        YMMUsed = true;
+        break;
+      }
    }
  }
  if (!YMMUsed) {
@@ -282,7 +286,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {

  // If any YMM regs are live in to this function, add the entry block to the
  // DirtySuccessors list
-  if (checkFnHasLiveInYmm(MRI))
+  if (FnHasLiveInYmm)
    addDirtySuccessor(MF.front());

  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add |
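Editor's note (illustration, not part of the patch): the X86TTIImpl::getIntImmCost(int64_t) overload shown in the X86TargetTransformInfo.cpp hunks above classifies a 64-bit immediate into three tiers: zero is free, any value representable as a signed 32-bit integer costs one basic instruction, and anything wider costs two. The standalone C++ sketch below mirrors that decision logic outside of LLVM so the tiers can be checked in isolation. The demo::TCC_Free/TCC_Basic constants and the fitsInSigned32 helper are stand-ins assumed here for illustration; they are not the LLVM definitions of TTI::TCC_Free, TTI::TCC_Basic, or isInt<32>.

// Illustrative mirror of the cost tiers used by the patched
// X86TTIImpl::getIntImmCost(int64_t); this is not LLVM code.
#include <cstdint>
#include <cstdio>

namespace demo {
// Stand-ins for TargetTransformInfo::TCC_Free / TCC_Basic (assumed values).
enum Cost : unsigned { TCC_Free = 0, TCC_Basic = 1 };

// Equivalent in spirit to llvm::isInt<32>(Val): true if Val fits in a
// signed 32-bit integer.
bool fitsInSigned32(int64_t Val) {
  return Val >= INT32_MIN && Val <= INT32_MAX;
}

// Mirrors the three-tier logic of the patch: 0 is free, a 32-bit-representable
// immediate costs one instruction, and a wider immediate is assumed to take two.
unsigned intImmCost(int64_t Val) {
  if (Val == 0)
    return TCC_Free;
  if (fitsInSigned32(Val))
    return TCC_Basic;
  return 2 * TCC_Basic;
}
} // namespace demo

int main() {
  const int64_t Samples[] = {0, 42, -1, INT64_C(0x1122334455667788)};
  for (int64_t V : Samples)
    std::printf("imm %lld -> cost %u\n", (long long)V, demo::intImmCost(V));
  return 0;
}

Run as-is, this prints a cost of 0 for 0, 1 for 42 and -1, and 2 for the wide 64-bit constant, matching the tiers the patch assigns when deciding whether constant hoisting should treat an immediate as free.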