diff options
author | Evan Cheng <evan.cheng@apple.com> | 2009-12-09 21:00:30 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2009-12-09 21:00:30 +0000 |
commit | c363094e04df621d41ca570eb2a7bf8826bb8c1a (patch) | |
tree | f471aca10ab30ffc20bb95bc94a18fed3c50357e /lib/Target/X86 | |
parent | 89452f7386540ca83e8991e74f1d74bbe7271922 (diff) | |
download | external_llvm-c363094e04df621d41ca570eb2a7bf8826bb8c1a.zip external_llvm-c363094e04df621d41ca570eb2a7bf8826bb8c1a.tar.gz external_llvm-c363094e04df621d41ca570eb2a7bf8826bb8c1a.tar.bz2 |
Optimize a splat of a scalar load into a shuffle of a vector load when it's legal, e.g.
vector_shuffle (scalar_to_vector (i32 load (ptr + 4))), undef, <0, 0, 0, 0>
=>
vector_shuffle (v4i32 load ptr), undef, <1, 1, 1, 1>
iff ptr is 16-byte aligned (or can be made 16-byte aligned).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@90984 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 91 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 4 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 2 |
3 files changed, 93 insertions, 4 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38e1ffe..8284b17 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3344,6 +3344,82 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, } SDValue +X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SelectionDAG &DAG) { + + // Check if the scalar load can be widened into a vector load. And if + // the address is "base + cst" see if the cst can be "absorbed" into + // the shuffle mask. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { + SDValue Ptr = LD->getBasePtr(); + if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + return SDValue(); + EVT PVT = LD->getValueType(0); + if (PVT != MVT::i32 && PVT != MVT::f32) + return SDValue(); + + int FI = -1; + int64_t Offset = 0; + if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { + FI = FINode->getIndex(); + Offset = 0; + } else if (Ptr.getOpcode() == ISD::ADD && + isa<ConstantSDNode>(Ptr.getOperand(1)) && + isa<FrameIndexSDNode>(Ptr.getOperand(0))) { + FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); + Offset = Ptr.getConstantOperandVal(1); + Ptr = Ptr.getOperand(0); + } else { + return SDValue(); + } + + SDValue Chain = LD->getChain(); + // Make sure the stack object alignment is at least 16. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (DAG.InferPtrAlignment(Ptr) < 16) { + if (MFI->isFixedObjectIndex(FI)) { + // Can't change the alignment. Reference stack + offset explicitly + // if stack pointer is at least 16-byte aligned. 
+ unsigned StackAlign = Subtarget->getStackAlignment(); + if (StackAlign < 16) + return SDValue(); + Offset = MFI->getObjectOffset(FI) + Offset; + SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, + getPointerTy()); + Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + DAG.getConstant(Offset & ~15, getPointerTy())); + Offset %= 16; + } else { + MFI->setObjectAlignment(FI, 16); + } + } + + // (Offset % 16) must be multiple of 4. Then address is then + // Ptr + (Offset & ~15). + if (Offset < 0) + return SDValue(); + if ((Offset % 16) & 3) + return SDValue(); + int64_t StartOffset = Offset & ~15; + if (StartOffset) + Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), + Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); + + int EltNo = (Offset - StartOffset) >> 2; + int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; + EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; + SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0); + // Canonicalize it to a v4i32 shuffle. + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getVectorShuffle(MVT::v4i32, dl, V1, + DAG.getUNDEF(MVT::v4i32), &Mask[0])); + } + + return SDValue(); +} + +SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); // All zero's are handled with pxor, all one's are handled with pcmpeqd. @@ -3486,8 +3562,19 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } // Splat is obviously ok. Let legalizer expand it to a shuffle. - if (Values.size() == 1) + if (Values.size() == 1) { + if (EVTBits == 32) { + // Instead of a shuffle like this: + // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> + // Check if it's possible to issue this instead. 
+ // shuffle (vload ptr)), undef, <1, 1, 1, 1> + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDValue Item = Op.getOperand(Idx); + if (Op.getNode()->isOnlyUserOf(Item.getNode())) + return LowerAsSplatVectorLoad(Item, VT, dl, DAG); + } return SDValue(); + } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. @@ -4278,7 +4365,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned ShAmt = 0; SDValue ShVal; bool isShift = getSubtarget()->hasSSE2() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7b4ab62..89b773d 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -626,7 +626,9 @@ namespace llvm { std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned); - + + SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SelectionDAG &DAG); SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG); SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG); SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index dfdd4ce..62841f8 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2083,7 +2083,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4i32 (pshufd:$src2 - (bc_v4i32(memopv2i64 addr:$src1)), + (bc_v4i32 (memopv2i64 addr:$src1)), (undef))))]>; } |