diff options
-rw-r--r-- | lib/Target/R600/AMDGPUISelLowering.cpp | 61 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUISelLowering.h | 4 | ||||
-rw-r--r-- | lib/Target/R600/R600ISelLowering.cpp | 8 | ||||
-rw-r--r-- | test/CodeGen/R600/store.ll | 62 |
4 files changed, 134 insertions, 1 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 7ceab2d..78495ca 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -67,6 +67,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::f64, Promote); AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + // XXX: This can be changed to Custom once ExpandVectorStores can + // handle 64-bit stores. + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setOperationAction(ISD::LOAD, MVT::f32, Promote); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); @@ -187,6 +194,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::STORE: return LowerVectorStore(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); } return Op; @@ -487,6 +495,59 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Ops, 2, DL); } +SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = dyn_cast<StoreSDNode>(Op); + EVT MemVT = Store->getMemoryVT(); + unsigned MemBits = MemVT.getSizeInBits(); + + // Byte stores are really expensive, so if possible, try to pack a + // 32-bit vector truncating store into an i32 store. 
+ // XXX: We could also optimize other vector bitwidths. + if (!MemVT.isVector() || MemBits > 32) { + return SDValue(); + } + + SDLoc DL(Op); + const SDValue &Value = Store->getValue(); + EVT VT = Value.getValueType(); + const SDValue &Ptr = Store->getBasePtr(); + EVT MemEltVT = MemVT.getVectorElementType(); + unsigned MemEltBits = MemEltVT.getSizeInBits(); + unsigned MemNumElements = MemVT.getVectorNumElements(); + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + SDValue Mask; + switch(MemEltBits) { + case 8: + Mask = DAG.getConstant(0xFF, PackedVT); + break; + case 16: + Mask = DAG.getConstant(0xFFFF, PackedVT); + break; + default: + llvm_unreachable("Cannot lower this vector store"); + } + SDValue PackedValue; + for (unsigned i = 0; i < MemNumElements; ++i) { + EVT ElemVT = VT.getVectorElementType(); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, + DAG.getConstant(i, MVT::i32)); + Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT); + Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask); + SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT); + Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift); + if (i == 0) { + PackedValue = Elt; + } else { + PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt); + } + } + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, + MachinePointerInfo(Store->getMemOperand()->getValue()), + Store->isVolatile(), Store->isNonTemporal(), + Store->getAlignment()); +} + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 8788c20..e3a0dcc 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -51,6 +51,10 @@ protected: void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> 
&Ins) const; + /// \brief Lower vector stores by merging the vector elements into an integer + /// of the same bitwidth. + SDValue LowerVectorStore(const SDValue &Op, SelectionDAG &DAG) const; + public: AMDGPUTargetLowering(TargetMachine &TM); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index b6b6560..e10af2b 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -1011,10 +1011,15 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Value = Op.getOperand(1); SDValue Ptr = Op.getOperand(2); + SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG); + if (Result.getNode()) { + return Result; + } + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { if (StoreNode->isTruncatingStore()) { EVT VT = Value.getValueType(); - assert(VT == MVT::i32); + assert(VT.bitsLE(MVT::i32)); EVT MemVT = StoreNode->getMemoryVT(); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1571,6 +1576,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } } } + case AMDGPUISD::EXPORT: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll index cba01a3..f24de04 100644 --- a/test/CodeGen/R600/store.ll +++ b/test/CodeGen/R600/store.ll @@ -63,6 +63,49 @@ entry: ret void } +; EG-CHECK: @store_v2i8 +; EG-CHECK: MEM_RAT MSKOR +; EG-CHECK-NOT: MEM_RAT MSKOR +; SI-CHECK: @store_v2i8 +; SI-CHECK: BUFFER_STORE_BYTE +; SI-CHECK: BUFFER_STORE_BYTE +define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(1)* %out + ret void +} + + +; EG-CHECK: @store_v2i16 +; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW +; CM-CHECK: @store_v2i16 +; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD +; SI-CHECK: @store_v2i16 +; SI-CHECK: BUFFER_STORE_DWORD +define void @store_v2i16(<2 x i16> addrspace(1)* 
%out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(1)* %out + ret void +} + +; EG-CHECK: @store_v4i8 +; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW +; CM-CHECK: @store_v4i8 +; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD +; SI-CHECK: @store_v4i8 +; SI-CHECK: BUFFER_STORE_BYTE +; SI-CHECK: BUFFER_STORE_BYTE +; SI-CHECK: BUFFER_STORE_BYTE +; SI-CHECK: BUFFER_STORE_BYTE +define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(1)* %out + ret void +} + ; floating-point store ; EG-CHECK: @store_f32 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 @@ -76,6 +119,25 @@ define void @store_f32(float addrspace(1)* %out, float %in) { ret void } +; EG-CHECK: @store_v4i16 +; EG-CHECK: MEM_RAT MSKOR +; EG-CHECK: MEM_RAT MSKOR +; EG-CHECK: MEM_RAT MSKOR +; EG-CHECK: MEM_RAT MSKOR +; EG-CHECK-NOT: MEM_RAT MSKOR +; SI-CHECK: @store_v4i16 +; SI-CHECK: BUFFER_STORE_SHORT +; SI-CHECK: BUFFER_STORE_SHORT +; SI-CHECK: BUFFER_STORE_SHORT +; SI-CHECK: BUFFER_STORE_SHORT +; SI-CHECK-NOT: BUFFER_STORE_BYTE +define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(1)* %out + ret void +} + ; vec2 floating-point stores ; EG-CHECK: @store_v2f32 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW |