From c6a4f5e819217e1e12c458aed8e7b122e23a3a58 Mon Sep 17 00:00:00 2001 From: Stephen Hines Date: Mon, 21 Jul 2014 00:45:20 -0700 Subject: Update LLVM for rebase to r212749. Includes a cherry-pick of: r212948 - fixes a small issue with atomic calls Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18 --- lib/Target/X86/X86FastISel.cpp | 1265 ++++++++++++++++++++++++++++++++-------- 1 file changed, 1033 insertions(+), 232 deletions(-) (limited to 'lib/Target/X86/X86FastISel.cpp') diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 56bcfa3..ce554ba 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -16,10 +16,12 @@ #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" +#include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -78,12 +80,14 @@ public: private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); - bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, - bool Aligned = false); - bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, - bool Aligned = false); + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -107,6 +111,12 @@ private: bool X86SelectDivRem(const Instruction *I); + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); @@ -147,10 +157,182 @@ private: bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); }; } // end anonymous namespace. +static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { + // If both operands are the same, then try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = CI->getPredicate(); + if (CI->getOperand(0) != CI->getOperand(1)) + return Predicate; + + switch (Predicate) { + default: llvm_unreachable("Invalid predicate!"); + case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; + + case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; + } + + return Predicate; +} + +static std::pair +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair +getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + 
bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. +bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way. + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) @@ -180,7 +362,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -228,8 +410,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, } ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), ResultReg), AM); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } @@ -237,9 +422,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. -bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, - const X86AddressMode &AM, bool Aligned) { +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -249,7 +434,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. @@ -288,13 +474,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, break; } - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM).addReg(ValReg); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM, bool Aligned) { + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); @@ -317,10 +508,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } if (Opc) { - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM) - .addImm(Signed ? (uint64_t) CI->getSExtValue() : - CI->getZExtValue()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } @@ -329,7 +522,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM, Aligned); + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -355,17 +549,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { return false; // Can't handle TLS yet. - if (const GlobalVariable *GVar = dyn_cast(GV)) - if (GVar->isThreadLocal()) - return false; - - // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how - // it works...). 
- if (const GlobalAlias *GA = dyn_cast(GV)) - if (const GlobalVariable *GVar = - dyn_cast_or_null(GA->getAliasee())) - if (GVar->isThreadLocal()) - return false; + if (GV->isThreadLocal()) + return false; // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the @@ -696,7 +881,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle DbgLocLImport. + // Can't handle DLL Import. if (GV->hasDLLImportStorageClass()) return false; @@ -749,19 +934,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; - unsigned SABIAlignment = - DL.getABITypeAlignment(S->getValueOperand()->getType()); - bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); MVT VT; - if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(1), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -896,25 +1086,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { /// X86SelectLoad - Select and emit code to implement load instructions. /// -bool X86FastISel::X86SelectLoad(const Instruction *I) { +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast(I); + // Atomic loads need special handling. - if (cast(I)->isAtomic()) + if (LI->isAtomic()) return false; MVT VT; - if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; unsigned ResultReg = 0; - if (X86FastEmitLoad(VT, AM, ResultReg)) { - UpdateValueMap(I, ResultReg); - return true; - } - return false; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + return false; + + UpdateValueMap(I, ResultReg); + return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { @@ -994,73 +1188,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - unsigned ResultReg = createResultReg(&X86::GR8RegClass); - unsigned SetCCOpc; - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - switch (CI->getPredicate()) { - case CmpInst::FCMP_OEQ: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + // Try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } - unsigned EReg = createResultReg(&X86::GR8RegClass); - unsigned NPReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::SETNPr), NPReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + if (ResultReg) { UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_UNE: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - unsigned NEReg = createResultReg(&X86::GR8RegClass); - unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg) - .addReg(PReg).addReg(NEReg); + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; - case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; - case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc 
= X86::SETEr; break; - case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; - case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; - case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; - case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; - case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break; - case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; - default: - return false; - } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + if (SwapArgs) - std::swap(Op0, Op1); + std::swap(LHS, RHS); - // Emit a compare of Op0/Op1. - if (!X86FastEmitCompare(Op0, Op1, VT)) + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); UpdateValueMap(I, ResultReg); return true; } @@ -1126,73 +1336,88 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Fold the common case of a conditional branch with a comparison // in the same block (values defined on other blocks may not have // initialized registers). + X86::CondCode CC; if (const CmpInst *CI = dyn_cast(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + // Try to take advantage of fallthrough opportunities. - CmpInst::Predicate Predicate = CI->getPredicate(); if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA" - + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. Instead two branch instructions are required to check all + // the flags. 
First we change the predicate to a supported condition code, + // which will be the first branch. Later one we will emit the second + // branch. + bool NeedExtraBranch = false; switch (Predicate) { + default: break; case CmpInst::FCMP_OEQ: - std::swap(TrueMBB, FalseMBB); - Predicate = CmpInst::FCMP_UNE; - // FALL THROUGH - case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break; - case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break; - case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; - case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break; - case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break; - case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break; - case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break; - default: - return false; + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) - std::swap(Op0, Op1); + std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(Op0, Op1, VT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - if (Predicate == CmpInst::FCMP_UNE) { - // X86 requires a second branch to handle UNE (and OEQ, - // which is mapped to UNE above). + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4)) .addMBB(TrueMBB); } + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + // Emits an unconditional branch to the FalseBB, obtains the branch + // weight, and adds it to the successor list. 
FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + return true; } } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { @@ -1224,10 +1449,32 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; } // Otherwise do a clumsy setcc and re-test it. @@ -1241,7 +1488,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } @@ -1478,50 +1729,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { return true; } -bool X86FastISel::X86SelectSelect(const Instruction *I) { - MVT VT; - if (!isTypeLegal(I->getType(), VT)) +/// \brief Emit a conditional move instruction (if the are supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) return false; - // We only use cmov here, if we don't have a cmov instruction bail. - if (!Subtarget->hasCMov()) return false; + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) + return false; - unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; - if (VT == MVT::i16) { - Opc = X86::CMOVE16rr; - RC = &X86::GR16RegClass; - } else if (VT == MVT::i32) { - Opc = X86::CMOVE32rr; - RC = &X86::GR32RegClass; - } else if (VT == MVT::i64) { - Opc = X86::CMOVE64rr; - RC = &X86::GR64RegClass; - } else { + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + UpdateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. +bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
+ const auto *CI = dyn_cast(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64) )) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); - if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); - if (Op1Reg == 0) return false; - unsigned Op2Reg = getRegForValue(I->getOperand(2)); - if (Op2Reg == 0) return false; - - // Selects operate on i1, however, Op0Reg is 8 bits width and may contain - // garbage. Indeed, only the less significant bit is supposed to be accurate. - // If we read more than the lsb, we may see non-zero values whereas lsb - // is zero. Therefore, we have to truncate Op0Reg to i1 for the select. - // This is achieved by performing TEST against 1. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(Op0Reg).addImm(1); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Op1Reg).addReg(Op2Reg); + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + static unsigned OpcTable[2][2][4] = { + { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, + { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, + { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + }; + + bool HasAVX = Subtarget->hasAVX(); + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; + case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and 
will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); UpdateValueMap(I, ResultReg); return true; } +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. + if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + UpdateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. + if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { // fpext from float to double. 
if (X86ScalarSSEf64 && @@ -1633,8 +2153,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, } unsigned Reg; - bool RV = X86FastEmitLoad(VT, SrcAM, Reg); - RV &= X86FastEmitStore(VT, Reg, DestAM); + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; @@ -1646,10 +2166,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } +static bool isCommutativeIntrinsic(IntrinsicInst const &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + return true; + default: + return false; + } +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + Type *RetTy = I.getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getFrameRegister, otherwise we get + // the wrong frame register. + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = + static_cast(TM.getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: { const MemCpyInst &MCI = cast(I); // Don't handle volatile or variable length memcpys. @@ -1726,52 +2310,233 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // FIXME: Should fold immediates. + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; - // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. 
- const Function *Callee = I.getCalledFunction(); - Type *RetTy = - cast(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); + Type *RetTy = I.getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - const Value *Op1 = I.getArgOperand(0); - const Value *Op2 = I.getArgOperand(1); - unsigned Reg1 = getRegForValue(Op1); - unsigned Reg2 = getRegForValue(Op2); + // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. + static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = I.getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); - if (Reg1 == 0 || Reg2 == 0) - // FIXME: Handle values *not* in registers. + if (SrcReg == 0) return false; - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::ADD32rr; - else if (VT == MVT::i64) - OpC = X86::ADD64rr; - else + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + UpdateValueMap(&I, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = I.getCalledFunction(); + auto *Ty = cast(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = I.getArgOperand(0); + const Value *RHS = I.getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa(LHS) && !isa(RHS) && + isCommutativeIntrinsic(I)) + std::swap(LHS, RHS); + + unsigned BaseOpc, CondOpc; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) return false; + bool LHSIsKill = hasTrivialKill(LHS); - // The call to CreateRegs builds two sequential registers, to store the - // both the returned values. 
- unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg) - .addReg(Reg1).addReg(Reg2); + unsigned ResultReg = 0; + // Check if we have an immediate version. + if (auto const *C = dyn_cast(RHS)) { + ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + C->getZExtValue()); + } - unsigned Opc = X86::SETBr; - if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) - Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), - ResultReg + 1); + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); UpdateValueMap(&I, ResultReg, 2); return true; } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = I.getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + 
case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = I.getArgOperand(0); + while (auto *IE = dyn_cast(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa(Index)) + break; + unsigned Idx = cast(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + UpdateValueMap(&I, ResultReg); + return true; + } } } @@ -1794,31 +2559,43 @@ bool X86FastISel::FastLowerArguments() { return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. - unsigned Idx = 1; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - if (Idx > 6) - return false; - + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. + ++Idx; if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; - Type *ArgTy = I->getType(); + Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; case MVT::i32: case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; break; - default: - return false; } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; } static const MCPhysReg GPR32ArgRegs[] = { @@ -1827,24 +2604,33 @@ bool X86FastISel::FastLowerArguments() { static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; - Idx = 0; - const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32); - const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; - const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; - unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). 
unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(DstReg, getKillRegState(true)); - UpdateValueMap(I, ResultReg); + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + UpdateValueMap(&Arg, ResultReg); } return true; } @@ -2147,7 +2933,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86FastEmitStore(ArgVT, ArgVal, AM)) return false; } else { - if (!X86FastEmitStore(ArgVT, Arg, AM)) + if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM)) return false; } } @@ -2430,7 +3216,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA instructions. + // Materialize addresses with LEA/MOV instructions. if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { @@ -2440,10 +3226,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; - Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement could be more than 32 bits away, so we need to use + // an instruction with a 64-bit immediate. + Opc = X86::MOV64ri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C)); + } else { + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); + } return ResultReg; } return 0; @@ -2544,8 +3339,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; - if (!X86SelectAddress(LI->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo&)TII; @@ -2553,13 +3349,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + SmallVector<MachineOperand, 8> AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (!Result) return false; + if (!Result) + return false; + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true;