 lib/Target/X86/X86ISelDAGToDAG.cpp      |  77
 lib/Target/X86/X86ISelLowering.cpp      | 170
 lib/Target/X86/X86InstrSSE.td           |   8
 test/CodeGen/X86/fp-stack-direct-ret.ll |  11
 test/CodeGen/X86/fp-stack-ret-conv.ll   |  17
 test/CodeGen/X86/pr1505b.ll             |   2
 6 files changed, 157 insertions(+), 128 deletions(-)
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9887bcc..53cd612 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -156,7 +156,8 @@ namespace {
     bool TryFoldLoad(SDOperand P, SDOperand N,
                      SDOperand &Base, SDOperand &Scale,
                      SDOperand &Index, SDOperand &Disp);
-    void InstructionSelectPreprocess(SelectionDAG &DAG);
+    void PreprocessForRMW(SelectionDAG &DAG);
+    void PreprocessForFPConvert(SelectionDAG &DAG);
 
     /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
     /// inline asm expressions.
@@ -350,9 +351,10 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
                  Store.getOperand(2), Store.getOperand(3));
 }
 
-/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
-/// selector to pick more load-modify-store instructions. This is a common
-/// case:
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -fast mode (aka -O0).
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
 ///
 ///     [Load chain]
 ///         ^
 ///
@@ -389,7 +391,7 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
 ///       \      /
 ///       [Store]
-void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+void X86DAGToDAGISel::PreprocessForRMW(SelectionDAG &DAG) {
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
        E = DAG.allnodes_end(); I != E; ++I) {
     if (!ISD::isNON_TRUNCStore(I))
       continue;
@@ -459,6 +461,66 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
   }
 }
 
+
+/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend
+/// nodes that target the FP stack to be store and load to the stack.  This is a
+/// gross hack.  We would like to simply mark these as being illegal, but when
+/// we do that, legalize produces these when it expands calls, then expands
+/// these in the same legalize pass.  We would like dag combine to be able to
+/// hack on these between the call expansion and the node legalization.  As such
+/// this pass basically does "really late" legalization of these inline with the
+/// X86 isel pass.
+void X86DAGToDAGISel::PreprocessForFPConvert(SelectionDAG &DAG) {
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    MVT::ValueType SrcVT = N->getOperand(0).getValueType();
+    MVT::ValueType DstVT = N->getValueType(0);
+    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    // If this is an FPStack extension (but not a truncation), it is a noop.
+    if (!SrcIsSSE && !DstIsSSE && N->getOpcode() == ISD::FP_EXTEND)
+      continue;
+
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    MVT::ValueType MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+    SDOperand MemTmp = DAG.CreateStackTemporary(MemVT);
+
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDOperand Store = DAG.getTruncStore(DAG.getEntryNode(), N->getOperand(0),
+                                        MemTmp, NULL, 0, MemVT);
+    SDOperand Result = DAG.getExtLoad(ISD::EXTLOAD, DstVT, Store, MemTmp,
+                                      NULL, 0, MemVT);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    DAG.ReplaceAllUsesOfValueWith(SDOperand(N, 0), Result);
+
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    DAG.DeleteNode(N);
+  }
+}
+
 /// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
 /// when it has created a SelectionDAG for us to codegen.
 void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
@@ -466,7 +528,10 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
   MachineFunction::iterator FirstMBB = BB;
 
   if (!FastISel)
-    InstructionSelectPreprocess(DAG);
+    PreprocessForRMW(DAG);
+
+  // FIXME: This should only happen when not -fast.
+  PreprocessForFPConvert(DAG);
 
   // Codegen the basic block.
 #ifndef NDEBUG
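The core decision in PreprocessForFPConvert is which width to use for the stack slot: FP_ROUND has to store at the destination width (there is no truncating load), while FP_EXTEND stores at the source width whenever the source lives in an SSE register, so the extending load it leaves behind can later be folded into an SSE instruction (see the CVTSS2SDrm pattern added to X86InstrSSE.td further down). A minimal standalone sketch of that table, using stub enums rather than LLVM's MVT types:

    // Sketch only: stub enums stand in for MVT::ValueType and the ISD opcodes.
    #include <cassert>

    enum VT { f32, f64, f80 };
    enum Opcode { FP_ROUND, FP_EXTEND };

    // Mirrors the MemVT choice in PreprocessForFPConvert above.
    static VT chooseMemVT(Opcode Opc, VT SrcVT, VT DstVT, bool SrcIsSSE) {
      if (Opc == FP_ROUND)
        return DstVT;                   // must round in memory; no 'trunc load'
      return SrcIsSSE ? SrcVT : DstVT;  // an extload covers the widening
    }

    int main() {
      // f32 in SSE extended to f80 on the x87 stack: store f32, extload f80.
      assert(chooseMemVT(FP_EXTEND, f32, f80, /*SrcIsSSE=*/true) == f32);
      // f80 on the x87 stack rounded to f64 in SSE: store f64, plain load f64.
      assert(chooseMemVT(FP_ROUND, f80, f64, /*SrcIsSSE=*/false) == f64);
      return 0;
    }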
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9140539..9ca99fe 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -47,6 +47,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+  bool Fast = false;
 
   RegInfo = TM.getRegisterInfo();
 
@@ -355,13 +356,15 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     addLegalFPImmediate(APFloat(+0.0)); // xorpd
     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 
-    // Conversions to long double (in X87) go through memory.
-    setConvertAction(MVT::f32, MVT::f80, Expand);
-    setConvertAction(MVT::f64, MVT::f80, Expand);
-
-    // Conversions from long double (in X87) go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // Floating truncations from f80 and extensions to f80 go through memory.
+    // If optimizing, we lie about this though and handle it in
+    // InstructionSelectPreprocess so that dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f32, MVT::f80, Expand);
+      setConvertAction(MVT::f64, MVT::f80, Expand);
+      setConvertAction(MVT::f80, MVT::f32, Expand);
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
   } else if (X86ScalarSSEf32) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
@@ -395,15 +398,17 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 
-    // SSE->x87 conversions go through memory.
-    setConvertAction(MVT::f32, MVT::f64, Expand);
-    setConvertAction(MVT::f32, MVT::f80, Expand);
-
-    // x87->SSE truncations need to go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);
-    setConvertAction(MVT::f64, MVT::f32, Expand);
-    // And x87->x87 truncations also.
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // SSE <-> X87 conversions go through memory.  If optimizing, we lie about
+    // this though and handle it in InstructionSelectPreprocess so that
+    // dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f32, MVT::f64, Expand);
+      setConvertAction(MVT::f32, MVT::f80, Expand);
+      setConvertAction(MVT::f80, MVT::f32, Expand);
+      setConvertAction(MVT::f64, MVT::f32, Expand);
+      // And x87->x87 truncations also.
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
 
     if (!UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
@@ -420,10 +425,14 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
-    // Floating truncations need to go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);
-    setConvertAction(MVT::f64, MVT::f32, Expand);
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // Floating truncations go through memory.  If optimizing, we lie about
+    // this though and handle it in InstructionSelectPreprocess so that
+    // dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f80, MVT::f32, Expand);
+      setConvertAction(MVT::f64, MVT::f32, Expand);
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
 
     if (!UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
@@ -647,7 +656,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   }
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  
+
   // Custom lower v2i64 and v2f64 selects.
   setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
   setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
@@ -808,30 +817,10 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
     // a register.
     SDOperand Value = Op.getOperand(1);
 
     // If this is an FP return with ScalarSSE, we need to move the value from
-    // an XMM register onto the fp-stack.
-    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
-      SDOperand MemLoc;
-
-      // If this is a load into a scalarsse value, don't store the loaded value
-      // back to the stack, only to reload it: just replace the scalar-sse load.
-      if (ISD::isNON_EXTLoad(Value.Val) &&
-          Chain.reachesChainWithoutSideEffects(Value.getOperand(0))) {
-        Chain  = Value.getOperand(0);
-        MemLoc = Value.getOperand(1);
-      } else {
-        // Spill the value to memory and reload it into top of stack.
-        unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
-        MachineFunction &MF = DAG.getMachineFunction();
-        int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
-        MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
-        Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
-      }
-      SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
-      SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
-      Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
-      Chain = Value.getValue(1);
-    }
+    // an XMM register onto the fp-stack.  Do this with an FP_EXTEND to f80.
+    // This will get legalized into a load/store if it can't get optimized away.
+    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT()))
+      Value = DAG.getNode(ISD::FP_EXTEND, MVT::f80, Value);
 
     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
     SDOperand Ops[] = { Chain, Value };
@@ -876,87 +865,26 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
 
     // Copies from the FP stack are special, as ST0 isn't a valid register
     // before the fp stackifier runs.
-    // Copy ST0 into an RFP register with FP_GET_RESULT.
-    SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+    // Copy ST0 into an RFP register with FP_GET_RESULT.  If this will end up
+    // in an SSE register, copy it out as F80 and do a truncate, otherwise use
+    // the specified value type.
+    MVT::ValueType GetResultTy = RVLocs[0].getValVT();
+    if (isScalarFPTypeInSSEReg(GetResultTy))
+      GetResultTy = MVT::f80;
+    SDVTList Tys = DAG.getVTList(GetResultTy, MVT::Other, MVT::Flag);
+
     SDOperand GROps[] = { Chain, InFlag };
     SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
     Chain  = RetVal.getValue(1);
     InFlag = RetVal.getValue(2);
-
-    // If we are using ScalarSSE, store ST(0) to the stack and reload it into
-    // an XMM register.
-    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
-      SDOperand StoreLoc;
-      const Value *SrcVal = 0;
-      int SrcValOffset = 0;
-      MVT::ValueType RetStoreVT = RVLocs[0].getValVT();
-
-      // Determine where to store the value.  If the call result is directly
-      // used by a store, see if we can store directly into the location.  In
-      // this case, we'll end up producing a fst + movss[load] + movss[store] to
-      // the same location, and the two movss's will be nuked as dead.  This
-      // optimizes common things like "*D = atof(..)" to not need an
-      // intermediate stack slot.
-      if (SDOperand(TheCall, 0).hasOneUse() &&
-          SDOperand(TheCall, 1).hasOneUse()) {
-        // In addition to direct uses, we also support a FP_ROUND that uses the
-        // value, if it is directly stored somewhere.
-        SDNode *User = *TheCall->use_begin();
-        if (User->getOpcode() == ISD::FP_ROUND && User->hasOneUse())
-          User = *User->use_begin();
-
-        // Ok, we have one use of the value and one use of the chain.  See if
-        // they are the same node: a store.
-        if (StoreSDNode *N = dyn_cast<StoreSDNode>(User)) {
-          // Verify that the value being stored is either the call or a
-          // truncation of the call.
-          SDNode *StoreVal = N->getValue().Val;
-          if (StoreVal == TheCall)
-            ; // ok.
-          else if (StoreVal->getOpcode() == ISD::FP_ROUND &&
-                   StoreVal->hasOneUse() &&
-                   StoreVal->getOperand(0).Val == TheCall)
-            ; // ok.
-          else
-            N = 0;  // not ok.
-
-          if (N && N->getChain().Val == TheCall &&
-              !N->isVolatile() && !N->isTruncatingStore() &&
-              N->getAddressingMode() == ISD::UNINDEXED) {
-            StoreLoc = N->getBasePtr();
-            SrcVal = N->getSrcValue();
-            SrcValOffset = N->getSrcValueOffset();
-            RetStoreVT = N->getValue().getValueType();
-          }
-        }
-      }
-
-      // If we weren't able to optimize the result, just create a temporary
-      // stack slot.
-      if (StoreLoc.Val == 0) {
-        MachineFunction &MF = DAG.getMachineFunction();
-        int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
-        StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy());
-      }
-
-      // FIXME: Currently the FST is flagged to the FP_GET_RESULT.  This
-      // shouldn't be necessary except that RFP cannot be live across
-      // multiple blocks (which could happen if a select gets lowered into
-      // multiple blocks and scheduled in between them).  When stackifier is
-      // fixed, they can be uncoupled.
-      SDOperand Ops[] = {
-        Chain, RetVal, StoreLoc, DAG.getValueType(RetStoreVT), InFlag
-      };
-      Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
-      RetVal = DAG.getLoad(RetStoreVT, Chain,
-                           StoreLoc, SrcVal, SrcValOffset);
-      Chain = RetVal.getValue(1);
-
-      // If we optimized a truncate, then extend the result back to its desired
-      // type.
-      if (RVLocs[0].getValVT() != RetStoreVT)
-        RetVal = DAG.getNode(ISD::FP_EXTEND, RVLocs[0].getValVT(), RetVal);
-    }
+
+    // If we want the result in an SSE register, use an FP_TRUNCATE to get it
+    // there.
+    if (GetResultTy != RVLocs[0].getValVT())
+      RetVal = DAG.getNode(ISD::FP_ROUND, RVLocs[0].getValVT(), RetVal,
+                           // This truncation won't change the value.
+                           DAG.getIntPtrConstant(1));
+
     ResultVals.push_back(RetVal);
   }
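One subtlety in the LowerCallResult change: the FP_ROUND it builds carries DAG.getIntPtrConstant(1) as its second operand, which tells the combiner that the truncation does not change the value. That is sound because every f32 and f64 value is exactly representable in the 80-bit x87 format that ST(0) uses, so widening the returned value to f80 and narrowing it back is an exact round-trip. A standalone illustration (it assumes an x86 compiler where long double maps to the 80-bit format, though the assertion holds either way):

    #include <cassert>

    int main() {
      float f = 0.1f;                        // the callee's f32 result
      long double st0 = f;                   // comes back on the x87 stack (f80)
      float back = static_cast<float>(st0);  // the FP_ROUND into an SSE register
      assert(back == f);                     // exact: no precision was lost
      return 0;
    }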
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a73d5af..30d088c 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2734,6 +2734,14 @@ def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+let Predicates = [HasSSE2] in
+  def : Pat<(fextend (loadf32 addr:$src)),
+            (CVTSS2SDrm addr:$src)>;
+
 // Scalar to v8i16 / v16i8.  The source may be a GR32, but only the lower 8 or
 // 16-bits matter.
 def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
diff --git a/test/CodeGen/X86/fp-stack-direct-ret.ll b/test/CodeGen/X86/fp-stack-direct-ret.ll
new file mode 100644
index 0000000..78be2a3
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-direct-ret.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llc -march=x86 | not grep fstp
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | not grep movsd
+
+declare double @foo()
+
+define double @bar() {
+entry:
+        %tmp5 = tail call double @foo()
+        ret double %tmp5
+}
+
diff --git a/test/CodeGen/X86/fp-stack-ret-conv.ll b/test/CodeGen/X86/fp-stack-ret-conv.ll
new file mode 100644
index 0000000..5254e1c
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-ret-conv.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -mcpu=yonah | grep cvtss2sd
+; RUN: llvm-as < %s | llc -mcpu=yonah | grep fstps
+; RUN: llvm-as < %s | llc -mcpu=yonah | not grep cvtsd2ss
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "i686-apple-darwin8"
+
+define void @test(double *%b) {
+entry:
+        %tmp13 = tail call double @foo()
+        %tmp1314 = fptrunc double %tmp13 to float       ; <float> [#uses=1]
+        %tmp3940 = fpext float %tmp1314 to double       ; <double> [#uses=1]
+        volatile store double %tmp3940, double* %b
+        ret void
+}
+
+declare double @foo()
diff --git a/test/CodeGen/X86/pr1505b.ll b/test/CodeGen/X86/pr1505b.ll
index d5e3507..73cb23e 100644
--- a/test/CodeGen/X86/pr1505b.ll
+++ b/test/CodeGen/X86/pr1505b.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 3
+; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 4
 ; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstps | count 3
 ; PR1505
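For reference, the IR in fp-stack-ret-conv.ll corresponds roughly to the C++ below (an assumed reconstruction, for illustration only; the stand-in foo is defined locally so the sketch links and runs). The double comes back in ST(0), the fptrunc becomes an fstps through a stack slot, and the fpext becomes a cvtss2sd whose load is folded via the new CVTSS2SDrm pattern, which is exactly what the three RUN lines check (including that no cvtsd2ss appears):

    #include <cstdio>

    // Stand-in for the external callee in the test.
    extern "C" double foo() { return 3.141592653589793; }

    extern "C" void test(double *b) {
      float narrowed = static_cast<float>(foo());          // fptrunc: fstps
      *reinterpret_cast<volatile double *>(b) = narrowed;  // fpext: cvtss2sd
    }

    int main() {
      double d;
      test(&d);
      printf("%f\n", d);
      return 0;
    }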