diff options
author | Chad Rosier <mcrosier@apple.com> | 2012-04-09 20:32:02 +0000 |
---|---|---|
committer | Chad Rosier <mcrosier@apple.com> | 2012-04-09 20:32:02 +0000 |
commit | 7f354557089d93cc681c440f9e064c906e1fbd58 (patch) | |
tree | 2e210b2ea7b3828fefea848ead70dcd2736098c9 | |
parent | f31ceaf8b7b1c5092fe35e20732711f5e1074118 (diff) | |
download | external_llvm-7f354557089d93cc681c440f9e064c906e1fbd58.zip external_llvm-7f354557089d93cc681c440f9e064c906e1fbd58.tar.gz external_llvm-7f354557089d93cc681c440f9e064c906e1fbd58.tar.bz2 |
When performing a truncating store, it's possible to rearrange the data
in-register, such that we can use a single vector store rather than a
series of scalar stores.
For func_4_8 the generated code
vldr d16, LCPI0_0
vmov d17, r0, r1
vadd.i16 d16, d17, d16
vmov.u16 r0, d16[3]
strb r0, [r2, #3]
vmov.u16 r0, d16[2]
strb r0, [r2, #2]
vmov.u16 r0, d16[1]
strb r0, [r2, #1]
vmov.u16 r0, d16[0]
strb r0, [r2]
bx lr
becomes
vldr d16, LCPI0_0
vmov d17, r0, r1
vadd.i16 d16, d17, d16
vuzp.8 d16, d17
vst1.32 {d16[0]}, [r2, :32]
bx lr
I'm not fond of how this combine pessimizes 2012-03-13-DAGCombineBug.ll,
but I couldn't think of a way to judiciously apply this combine.
This
ldrh r0, [r0, #4]
strh r0, [r1]
becomes
vldr d16, [r0]
vmov.u16 r0, d16[2]
vmov.32 d16[0], r0
vuzp.16 d16, d17
vst1.32 {d16[0]}, [r1, :32]
PR11158
rdar://10703339
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154340 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 86 | ||||
-rw-r--r-- | test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll | 3 | ||||
-rw-r--r-- | test/CodeGen/ARM/opt-shuff-tstore.ll | 19 | ||||
-rw-r--r-- | test/CodeGen/ARM/vrev.ll | 6 |
4 files changed, 107 insertions, 7 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 15bd339..e9b1c5f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7339,8 +7339,92 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile()) + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. 
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + DebugLoc DL = St->getDebugLoc(); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. 
+ unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], + Chains.size()); + } + + if (!ISD::isNormalStore(St)) return SDValue(); // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and diff --git a/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll b/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll index 6d596df..6206cd7 100644 --- a/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll +++ b/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll @@ -6,8 +6,7 @@ ; (i32 extload $addr+c*sizeof(i16) define void @test_hi_short3(<3 x i16> * nocapture %srcA, <2 x i16> * nocapture %dst) nounwind { entry: -; CHECK: ldrh [[REG:r[0-9]+]] -; CHECK: strh [[REG]] +; CHECK: vst1.32 %0 = load <3 x i16> * %srcA, align 8 %1 = shufflevector <3 x i16> %0, <3 x i16> undef, <2 x i32> <i32 2, i32 undef> store <2 x i16> %1, <2 x i16> * %dst, align 4 diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll new file mode 100644 index 0000000..b4da552 --- /dev/null +++ b/test/CodeGen/ARM/opt-shuff-tstore.ll @@ -0,0 +1,19 @@ +; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s + +; CHECK: func_4_8 +; CHECK: vst1.32 +; CHECK-NEXT: bx lr +define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) { + %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4> + store <4 x i8> %r, <4 x i8>* %p + ret void +} + +; CHECK: func_2_16 +; CHECK: vst1.32 +; CHECK-NEXT: bx lr +define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) { + %r = add <2 x i16> %param, 
<i16 1, i16 2> + store <2 x i16> %r, <2 x i16>* %p + ret void +} diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll index e154334..122ec03 100644 --- a/test/CodeGen/ARM/vrev.ll +++ b/test/CodeGen/ARM/vrev.ll @@ -149,12 +149,10 @@ define void @test_with_vcombine(<4 x float>* %v) nounwind { } ; The type <2 x i16> is legalized to <2 x i32> and need to be trunc-stored -; to <2 x i16> when stored to memory. Currently ARM scalarizes these stores. -; See PR 11158 +; to <2 x i16> when stored to memory. define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp { ; CHECK: test_vrev64: -; CHECK: vst1.16 -; CHECK: vst1.16 +; CHECK: vst1.32 entry: %0 = bitcast <4 x i16>* %source to <8 x i16>* %tmp2 = load <8 x i16>* %0, align 4 |