 lib/Target/X86/X86ISelLowering.cpp | 383 ++++++++++++++++++++++++++++--------
 1 files changed, 309 insertions(+), 74 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cd1d201..1cdf2b6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -222,7 +222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
-
+
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
if (Subtarget->is64Bit())
@@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ // Lower this to FGETSIGNx86 plus an AND.
+ setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+ setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Can turn SHL into an integer multiply.
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Legal);
// i8 and i16 vectors are custom , because the source register and source
// source memory operand types are not the same width. f32 vectors are
@@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
+ if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ }
+
if (Subtarget->hasSSE42())
setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
@@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
@@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
setPrefLoopAlignment(16);
benefitFromCodePlacementOpt = true;
+
+ setPrefFunctionAlignment(4);
}
@@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
- return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
-}
-
// FIXME: Why this routine is here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const{
@@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
#include "X86GenCallingConv.inc"
bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
@@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
@@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
- CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
- RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1));
- } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
- // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
- if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
- Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
- MVT::v2i64, InFlag).getValue(1);
- Val = Chain.getValue(0);
- Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
- Val, DAG.getConstant(0, MVT::i64));
- } else {
- Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
- MVT::i64, InFlag).getValue(1);
- Val = Chain.getValue(0);
- }
- Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
CopyVT, InFlag).getValue(1);
@@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -1952,7 +1953,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
return SDValue(OutRetAddr.getNode(), 1);
}
-/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
@@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -2043,7 +2044,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
SDValue RetAddrFrIdx;
- // Load return adress for tail calls.
+ // Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
@@ -2200,7 +2201,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- // Do not flag preceeding copytoreg stuff together with the following stuff.
+ // Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
if (GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -2270,6 +2271,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
const GlobalValue *GV = G->getGlobal();
if (!GV->hasDLLImportLinkage()) {
unsigned char OpFlags = 0;
+ bool ExtraLoad = false;
+ unsigned WrapperKind = ISD::DELETED_NODE;
// On ELF targets, in both X86-64 and X86-32 mode, direct calls to
// external symbols most go through the PLT in PIC mode. If the symbol
@@ -2281,15 +2284,34 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
(GV->isDeclaration() || GV->isWeakForLinker()) &&
- Subtarget->getDarwinVers() < 9) {
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB;
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ isa<Function>(GV) &&
+ cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
+ // If the function is marked as non-lazy, generate an indirect call
+ // which loads from the GOT directly. This avoids runtime overhead
+ // at the cost of eager binding (and one extra byte of encoding).
+ OpFlags = X86II::MO_GOTPCREL;
+ WrapperKind = X86ISD::WrapperRIP;
+ ExtraLoad = true;
}
Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
G->getOffset(), OpFlags);
+
+ // Add a wrapper if needed.
+ if (WrapperKind != ISD::DELETED_NODE)
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+ // Add extra indirection if needed.
+ if (ExtraLoad)
+ Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(),
+ false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
@@ -2300,7 +2322,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
getTargetMachine().getRelocationModel() == Reloc::PIC_) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
- Subtarget->getDarwinVers() < 9) {
+ (!Subtarget->getTargetTriple().isMacOSX() ||
+ Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
@@ -2528,16 +2551,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (RegInfo->needsStackRealignment(MF))
return false;
- // Do not sibcall optimize vararg calls unless the call site is not passing
- // any arguments.
- if (isVarArg && !Outs.empty())
- return false;
-
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
+ // Do not sibcall optimize vararg calls unless all arguments are passed via
+ // registers.
+ if (isVarArg && !Outs.empty()) {
+
+ // Optimizing for varargs on Win64 is unlikely to be safe without
+ // additional testing.
+ if (Subtarget->isTargetWin64())
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+ if (!ArgLocs[i].isRegLoc())
+ return false;
+ }
+
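For illustration only (not from the patch): the shape of vararg call this check now admits on x86-64 SysV, where every fixed and variadic argument is assigned a register, so the AnalyzeCallOperands loop above sees only register locations. Function names are hypothetical.

    // Hypothetical forwarding wrapper: fmt, a and b land in rdi, rsi, rdx,
    // so the call stays eligible for sibcall optimization and can be
    // emitted as a plain "jmp log_msg" at -O2.
    extern "C" int log_msg(const char *fmt, ...);

    extern "C" int forward(const char *fmt, int a, int b) {
      return log_msg(fmt, a, b);
    }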
// If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
// Therefore if it's not used by the call it is not safe to optimize this into
// a sibcall.
@@ -2550,8 +2587,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CalleeCC, false, getTargetMachine(),
- RVLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -2564,13 +2601,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// results are returned in the same way as what the caller expects.
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, getTargetMachine(),
- RVLocs1, *DAG.getContext());
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, getTargetMachine(),
- RVLocs2, *DAG.getContext());
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -2596,8 +2633,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
- ArgLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64()) {
@@ -4018,7 +4055,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come from a consecutively from a zero. The
-/// search can start in two diferent directions, from left or right.
+/// search can start in two different directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
bool ZerosFromLeft, SelectionDAG &DAG) {
@@ -6617,9 +6654,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
}
-/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
@@ -6708,12 +6745,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
unsigned ByteSize = SrcVT.getSizeInBits()/8;
- int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- MachineMemOperand *MMO =
- DAG.getMachineFunction()
- .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOLoad, ByteSize, ByteSize);
-
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+ MachineMemOperand *MMO;
+ if (FI) {
+ int SSFI = FI->getIndex();
+ MMO =
+ DAG.getMachineFunction()
+ .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+ MachineMemOperand::MOLoad, ByteSize, ByteSize);
+ } else {
+ MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+ StackSlot = StackSlot.getOperand(1);
+ }
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
X86ISD::FILD, DL,
@@ -7204,6 +7247,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
+SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+
+ // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+ SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+ DAG.getConstant(1, VT));
+ return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
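As a scalar picture of what this lowering computes (X86ISD::FGETSIGNx86 selects to MOVMSKPS/MOVMSKPD, and the AND keeps only bit 0), a hedged sketch rather than code from the patch:

    #include <cstdint>
    #include <cstring>

    // Equivalent of the i32 FGETSIGN result for an f32 input: bitcast the
    // float and mask its sign bit down to a single 0/1 value, mirroring
    // "movmskps; and $1".
    int fgetsign_f32(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      return static_cast<int>(bits >> 31) & 1;
    }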
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8779,16 +8833,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
-SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
LLVMContext *Context = DAG.getContext();
- assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+ // Must have SSE2.
+ if (!Subtarget->hasSSE2()) return SDValue();
+
+ // Optimize shl/srl/sra with constant shift amount.
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+
+ if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+ }
+
+ // Lower SHL with variable shift amount.
+ // Cannot lower SHL without SSE4.1 or later.
+ if (!Subtarget->hasSSE41()) return SDValue();
- if (VT == MVT::v4i32) {
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
@@ -8807,7 +8916,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
- if (VT == MVT::v16i8) {
+ if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
// a = a << 5;
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
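For reference, the constant-splat cases above correspond to the immediate-form SSE2 shift instructions; a sketch using the standard <emmintrin.h> intrinsics, assuming SSE2, not taken from the patch:

    #include <emmintrin.h>

    // Each shift-by-splat-constant lowers to one immediate-form instruction
    // instead of a scalarized or multiply-based expansion.
    __m128i shl_v4i32_by_5(__m128i v) { return _mm_slli_epi32(v, 5); } // pslld, x86_sse2_pslli_d
    __m128i srl_v2i64_by_7(__m128i v) { return _mm_srli_epi64(v, 7); } // psrlq, x86_sse2_psrli_q
    __m128i sra_v8i16_by_3(__m128i v) { return _mm_srai_epi16(v, 3); } // psraw, x86_sse2_psrai_w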
@@ -9112,7 +9221,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
- case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
@@ -9120,6 +9229,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FABS: return LowerFABS(Op, DAG);
case ISD::FNEG: return LowerFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
@@ -9140,7 +9250,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
- case ISD::SHL: return LowerSHL(Op, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: return LowerShift(Op, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
@@ -9307,6 +9419,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
+ case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
+ case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -10984,14 +11098,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
- // Compute the element's address.
+ // cOMpute the element's address.
SDValue Idx = Extract->getOperand(1);
unsigned EltSize =
InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
- SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
StackPtr, OffsetVal);
// Load the scalar.
@@ -11264,15 +11378,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
return SDValue();
+ SDValue FalseOp = N->getOperand(0);
+ SDValue TrueOp = N->getOperand(1);
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+ SDValue Cond = N->getOperand(3);
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ switch (Cond.getOpcode()) {
+ default: break;
+ case X86ISD::BSR:
+ case X86ISD::BSF:
+ // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+ if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+ return (CC == X86::COND_E) ? FalseOp : TrueOp;
+ }
+ }
+
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
- if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
- X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
-
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
@@ -11282,7 +11409,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
- SDValue Cond = N->getOperand(3);
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, MVT::i8), Cond);
@@ -11300,7 +11426,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
- SDValue Cond = N->getOperand(3);
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, MVT::i8), Cond);
@@ -11339,7 +11464,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
- SDValue Cond = N->getOperand(3);
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, MVT::i8), Cond);
// Zero extend the condition if needed.
@@ -11574,12 +11698,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
}
+// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends. Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ unsigned opcode;
+
+ // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+ // we're requiring SSE2 for both.
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CMP0 = N0->getOperand(1);
+ SDValue CMP1 = N1->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // The SETCCs should both refer to the same CMP.
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ return SDValue();
+
+ SDValue CMP00 = CMP0->getOperand(0);
+ SDValue CMP01 = CMP0->getOperand(1);
+ EVT VT = CMP00.getValueType();
+
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ bool ExpectingFlags = false;
+ // Check for any users that want flags:
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end();
+ !ExpectingFlags && UI != UE; ++UI)
+ switch (UI->getOpcode()) {
+ default:
+ case ISD::BR_CC:
+ case ISD::BRCOND:
+ case ISD::SELECT:
+ ExpectingFlags = true;
+ break;
+ case ISD::CopyToReg:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ break;
+ }
+
+ if (!ExpectingFlags) {
+ enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+ if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+ X86::CondCode tmp = cc0;
+ cc0 = cc1;
+ cc1 = tmp;
+ }
+
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ X86ISD::NodeType NTOperator = is64BitFP ?
+ X86ISD::FSETCCsd : X86ISD::FSETCCss;
+ // FIXME: need symbolic constants for these magic numbers.
+ // See X86ATTInstPrinter.cpp:printSSECC().
+ unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+ SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+ DAG.getConstant(x86cc, MVT::i8));
+ SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+ OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+ DAG.getConstant(1, MVT::i32));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+ return OneBitOfTruth;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
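The source-level pattern this combine is aimed at, shown as a hedged example rather than patch code: an ordered f32/f64 equality or inequality used as a value, which reaches the DAG as two SETCCs (E and NP, or NE and P) over one FP compare.

    // With SSE2 and no user that still wants EFLAGS, ucomiss + sete + setnp + and
    // becomes cmpeqss + movd + and $1 (and the CMPNEQSS analogue for !=).
    bool feq(float a, float b)   { return a == b; }  // AND(SETCC E, SETCC NP)
    bool fne(double a, double b) { return a != b; }  // OR(SETCC NE, SETCC P)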
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+
// Want to form PANDN nodes, in the hopes of then easily combining them with
// OR and AND nodes to form PBLEND/PSIGN.
EVT VT = N->getValueType(0);
@@ -11609,6 +11815,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
+
EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
return SDValue();
@@ -11976,6 +12186,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) {
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Op0 = N->getOperand(0);
+ // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+ // a 32-bit target where SSE doesn't support i64->FP operations.
+ if (Op0.getOpcode() == ISD::LOAD) {
+ LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+ EVT VT = Ld->getValueType(0);
+ if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+ ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+ !XTLI->getSubtarget()->is64Bit() &&
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+ return FILDChain;
+ }
+ }
+ return SDValue();
+}
+
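Roughly the case this combine targets, as a hedged illustration: on a 32-bit target i64 is not a legal type, so without the fold the loaded value would be reassembled through a stack temporary before the FILD; with it, BuildFILD reuses the load's memory operand and the conversion reads memory directly.

    // On i386, the i64 load feeding the conversion is illegal as a value type;
    // after the combine the result is computed as "fildll (addr)" straight
    // from the load's address.
    double load_and_convert(const long long *p) {
      return static_cast<double>(*p);
    }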
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -12060,6 +12290,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
@@ -12216,7 +12447,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces.clear();
SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
- // FIXME: this should verify that we are targetting a 486 or better. If not,
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical ops
// instead of emitting the bswap asm. For now, we don't support 486 or lower
// so don't worry about this.
@@ -12489,12 +12720,16 @@ LowerXConstraint(EVT ConstraintVT) const {
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
- char Constraint,
+ std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result(0, 0);
- switch (Constraint) {
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
default: break;
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
@@ -12686,7 +12921,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return std::make_pair(0U, X86::GR8RegisterClass);
if (VT == MVT::i16)
return std::make_pair(0U, X86::GR16RegisterClass);
- if (VT == MVT::i32 || !Subtarget->is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
return std::make_pair(0U, X86::GR32RegisterClass);
return std::make_pair(0U, X86::GR64RegisterClass);
case 'R': // LEGACY_REGS
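A small illustration of what adding MVT::f32 here permits: on x86-64, an 'r'-constrained inline-asm operand of float type is now given a 32-bit general-purpose register (GR32) rather than falling through to a 64-bit one. Hypothetical asm, not from the patch:

    // The float value travels through a GR32 for the asm; flipping bit 31
    // of its IEEE-754 representation negates it.
    float negate_via_gpr(float x) {
      asm("xorl $0x80000000, %0" : "+r"(x));
      return x;
    }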