Diffstat (limited to 'lib/Target/R600/AMDGPUISelLowering.cpp')
-rw-r--r-- | lib/Target/R600/AMDGPUISelLowering.cpp | 514
1 file changed, 476 insertions, 38 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index c4d75ff..183725c 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "AMDILIntrinsicInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -32,8 +33,9 @@ using namespace llvm;
 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
+                                        ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
 
   return true;
 }
@@ -43,6 +45,8 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   TargetLowering(TM, new TargetLoweringObjectFileELF()) {
 
+  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
+
   // Initialize target lowering borrowed from AMDIL
   InitAMDILLowering();
@@ -59,6 +63,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT, MVT::f32, Legal);
   setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 
   // The hardware supports ROTR, but not ROTL
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -93,10 +98,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+
+  // XXX: This can be changed to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
 
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@@ -117,8 +128,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
@@ -133,6 +150,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
 
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+
   setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
@@ -168,6 +187,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::UDIV, VT, Expand);
     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::XOR, VT, Expand);
   }
@@ -182,12 +202,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FABS, VT, Expand);
     setOperationAction(ISD::FADD, VT, Expand);
     setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
     setOperationAction(ISD::FMUL, VT, Expand);
     setOperationAction(ISD::FRINT, VT, Expand);
     setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
   }
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 }
 
 //===----------------------------------------------------------------------===//
@@ -225,6 +264,43 @@
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
   return VT == MVT::f32;
 }
 
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
+         (Dest->getPrimitiveSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+  const DataLayout *DL = getDataLayout();
+  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
+  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+
+  return SrcSize == 32 && DestSize == 64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+  // Any register load of a 64-bit value really requires 2 32-bit moves. For
+  // all practical purposes, the extra mov 0 to load a 64-bit value is free.
+  // As used, this will enable reducing 64-bit operations to 32-bit, which is
+  // always good.
+  return Src == MVT::i32 && Dest == MVT::i64;
+}
+
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+  // limited number of native 64-bit operations. Shrinking an operation to fit
+  // in a single 32-bit register should always be helpful. As currently used,
+  // this is much less general than the name suggests, and is only used in
+  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
+  // is not profitable, and may actually be harmful.
+  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+}
+
 //===---------------------------------------------------------------------===//
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//
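The three hooks above encode one register-file fact: 64-bit values live in pairs of 32-bit registers, so truncating to a multiple of 32 bits is just a subregister read, and zero-extending i32 to i64 only costs a zeroed high register. A minimal standalone restatement of those predicates, with illustrative helper names that are not part of the patch:

#include <cassert>

// Standalone restatement of the cost model encoded by the hooks above:
// a truncate to a multiple of 32 bits just reads a subregister, and a
// zext from 32 to 64 bits only costs a "mov 0" into the high half.
static bool truncateIsFree(unsigned SrcBits, unsigned DestBits) {
  return DestBits < SrcBits && (DestBits % 32 == 0);
}

static bool zextIsFree(unsigned SrcBits, unsigned DestBits) {
  return SrcBits == 32 && DestBits == 64;
}

int main() {
  assert(truncateIsFree(64, 32));   // subregister read of the low half
  assert(!truncateIsFree(32, 16));  // sub-dword truncates are not free
  assert(zextIsFree(32, 64));       // high half is just a zeroed register
  assert(!zextIsFree(16, 32));
  return 0;
}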
@@ -254,8 +330,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   switch (Op.getOpcode()) {
   default:
     Op.getNode()->dump();
-    assert(0 && "Custom lowering code for this"
-        "instruction is not implemented yet!");
+    llvm_unreachable("Custom lowering code for this "
+                     "instruction is not implemented yet!");
     break;
   // AMDIL DAG lowering
   case ISD::SDIV: return LowerSDIV(Op, DAG);
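The switch from assert(0) to llvm_unreachable here and in LowerMinMax below is more than style: assert compiles away under NDEBUG and would let the default case fall through, while llvm_unreachable still aborts in asserts builds and marks the path unreachable to the optimizer in release builds. A rough sketch of the dispatch-with-unreachable-default pattern, using hypothetical opcode and handler names rather than the real AMDGPU ones:

#include <cstdio>
#include <cstdlib>

enum Opcode { OP_SDIV, OP_FRINT, OP_OTHER };

// Stand-in for llvm_unreachable in asserts builds: report and abort.
[[noreturn]] static void unreachable(const char *Msg) {
  std::fprintf(stderr, "UNREACHABLE: %s\n", Msg);
  std::abort();
}

static int lowerOperation(Opcode Op) {
  switch (Op) {
  case OP_SDIV:  return 1; // would call a LowerSDIV-style handler
  case OP_FRINT: return 2; // would call a LowerFRINT-style handler
  default:
    // [[noreturn]] keeps the compiler from complaining that control
    // reaches the end of a non-void function.
    unreachable("Custom lowering code for this instruction is not "
                "implemented yet!");
  }
}

int main() { return lowerOperation(OP_SDIV) == 1 ? 0 : 1; }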
@@ -273,32 +349,124 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   return Op;
 }
 
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  case ISD::SIGN_EXTEND_INREG:
+    // Different parts of legalization seem to interpret which type of
+    // sign_extend_inreg is the one to check for custom lowering. The extended
+    // from type is what really matters, but some places check for custom
+    // lowering of the result type. This results in trying to use
+    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
+    // nothing here and let the illegal result integer be handled normally.
+    return;
+
+  default:
+    return;
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
+                                                       const GlobalValue *GV,
+                                                       const SDValue &InitPtr,
+                                                       SDValue Chain,
+                                                       SelectionDAG &DAG) const {
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  SDLoc DL(InitPtr);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
+    EVT VT = EVT::getEVT(CI->getType());
+    PointerType *PtrTy = PointerType::get(CI->getType(), 0);
+    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD->getPrefTypeAlignment(CI->getType()));
+  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+    EVT VT = EVT::getEVT(CFP->getType());
+    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
+    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD->getPrefTypeAlignment(CFP->getType()));
+  } else if (Init->getType()->isAggregateType()) {
+    EVT PtrVT = InitPtr.getValueType();
+    unsigned NumElements = Init->getType()->getArrayNumElements();
+    SmallVector<SDValue, 8> Chains;
+    for (unsigned i = 0; i < NumElements; ++i) {
+      SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
+          Init->getType()->getArrayElementType()), PtrVT);
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+      Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
+                                                GV, Ptr, Chain, DAG));
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
+                       Chains.size());
+  } else {
+    Init->dump();
+    llvm_unreachable("Unhandled constant initializer");
+  }
+}
+
 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                  SDValue Op,
                                                  SelectionDAG &DAG) const {
 
   const DataLayout *TD = getTargetMachine().getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = G->getGlobal();
 
-  assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
-  // XXX: What does the value of G->getOffset() mean?
-  assert(G->getOffset() == 0 &&
-         "Do not know what to do with an non-zero offset");
-  const GlobalValue *GV = G->getGlobal();
-
-  unsigned Offset;
-  if (MFI->LocalMemoryObjects.count(GV) == 0) {
-    uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
-    Offset = MFI->LDSSize;
-    MFI->LocalMemoryObjects[GV] = Offset;
-    // XXX: Account for alignment?
-    MFI->LDSSize += Size;
-  } else {
-    Offset = MFI->LocalMemoryObjects[GV];
-  }
-
-  return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+  switch (G->getAddressSpace()) {
+  default: llvm_unreachable("Global Address lowering not implemented for this "
+                            "address space");
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    // XXX: What does the value of G->getOffset() mean?
+    assert(G->getOffset() == 0 &&
+           "Do not know what to do with a non-zero offset");
+    unsigned Offset;
+    if (MFI->LocalMemoryObjects.count(GV) == 0) {
+      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+      Offset = MFI->LDSSize;
+      MFI->LocalMemoryObjects[GV] = Offset;
+      // XXX: Account for alignment?
+      MFI->LDSSize += Size;
+    } else {
+      Offset = MFI->LocalMemoryObjects[GV];
+    }
+
+    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+    Type *EltType = GV->getType()->getElementType();
+    unsigned Size = TD->getTypeAllocSize(EltType);
+    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+
+    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+    const Constant *Init = Var->getInitializer();
+    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+    SDValue InitPtr = DAG.getFrameIndex(FI,
+                                        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    SmallVector<SDNode*, 8> WorkList;
+
+    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
+                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
+      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD &&
+          I->getOpcode() != ISD::LOAD)
+        continue;
+      WorkList.push_back(*I);
+    }
+    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr,
+                                             DAG.getEntryNode(), DAG);
+    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
+                                           E = WorkList.end(); I != E; ++I) {
+      SmallVector<SDValue, 8> Ops;
+      Ops.push_back(Chain);
+      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
+        Ops.push_back((*I)->getOperand(i));
+      }
+      DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
+    }
+    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
+                              getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+  }
+  }
 }
 
 void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
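LowerConstantInitializer above recursively turns a global's initializer into a chain of stores through a frame-index pointer, placing array element i at byte offset i * getTypeAllocSize(element). A host-side analogue of that recursion, flattening a nested array initializer into (byte offset, value) pairs; the Init type and the fixed 4-byte scalar size are simplifying assumptions for illustration only:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct Init {
  bool IsScalar = true;
  uint32_t Value = 0;        // used when IsScalar
  std::vector<Init> Elems;   // used when !IsScalar
};

// Analogue of getTypeAllocSize: scalars are assumed to be 4 bytes.
static unsigned allocSize(const Init &I) {
  if (I.IsScalar)
    return 4;
  unsigned Size = 0;
  for (const Init &E : I.Elems)
    Size += allocSize(E);
  return Size;
}

// Emit one (byte offset, value) pair per scalar leaf, advancing the
// offset by the element's alloc size, as in the diff's address math.
static void emitStores(const Init &I, unsigned Offset,
                       std::vector<std::pair<unsigned, uint32_t>> &Out) {
  if (I.IsScalar) {
    Out.push_back({Offset, I.Value});
    return;
  }
  for (const Init &E : I.Elems) {
    emitStores(E, Offset, Out);
    Offset += allocSize(E);
  }
}

int main() {
  Init Arr{false, 0, {{true, 1}, {true, 2}, {true, 3}}};
  std::vector<std::pair<unsigned, uint32_t>> Out;
  emitStores(Arr, 0, Out);
  assert(Out.size() == 3 && Out[1].first == 4 && Out[2].second == 3);
  return 0;
}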
@@ -391,6 +559,30 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case AMDGPUIntrinsic::AMDGPU_umin:
       return DAG.getNode(AMDGPUISD::UMIN, DL, VT,
                          Op.getOperand(1),
                          Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfi:
+      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfm:
+      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2));
+
     case AMDGPUIntrinsic::AMDIL_round_nearest:
       return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
   }
@@ -455,7 +647,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   case ISD::SETTRUE2:
   case ISD::SETUO:
   case ISD::SETO:
-    assert(0 && "Operation should already be optimised !");
+    llvm_unreachable("Operation should already be optimised!");
   case ISD::SETULE:
   case ISD::SETULT:
   case ISD::SETOLE:
@@ -479,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
     return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
   }
   case ISD::SETCC_INVALID:
-    assert(0 && "Invalid setcc condcode !");
+    llvm_unreachable("Invalid setcc condcode!");
   }
   return Op;
 }
@@ -503,8 +695,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                                MemEltVT, Load->isVolatile(),
                                Load->isNonTemporal(), Load->getAlignment()));
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0],
-                     Loads.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
+                     Loads.data(), Loads.size());
 }
 
 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
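The new AMDGPU_bfe_*, AMDGPU_bfi, and AMDGPU_bfm intrinsics map one-to-one onto the BFE/BFI/BFM target nodes. As a reference for what those bitfield operations compute, here are the usual R600-family semantics in plain C++; the exact behavior for out-of-range widths and offsets is an assumption, not something this patch pins down:

#include <cassert>
#include <cstdint>

// Unsigned bitfield extract: Width bits starting at Off.
static uint32_t bfe_u32(uint32_t Src, uint32_t Off, uint32_t W) {
  if (W == 0)
    return 0;
  return (Src >> (Off & 31)) & (W < 32 ? (1u << W) - 1 : ~0u);
}

// Signed bitfield extract: extract, then sign-extend from bit W-1.
static int32_t bfe_i32(uint32_t Src, uint32_t Off, uint32_t W) {
  uint32_t Field = bfe_u32(Src, Off, W);
  if (W == 0 || W >= 32)
    return (int32_t)Field;
  uint32_t SignBit = 1u << (W - 1);
  return (int32_t)((Field ^ SignBit) - SignBit);
}

// Bitfield insert: take A where Mask is set, B elsewhere.
static uint32_t bfi(uint32_t Mask, uint32_t A, uint32_t B) {
  return (Mask & A) | (~Mask & B);
}

// Bitfield mask: Width low bits set, shifted left by Off.
static uint32_t bfm(uint32_t Width, uint32_t Off) {
  return ((Width & 31) ? (1u << (Width & 31)) - 1 : 0) << (Off & 31);
}

int main() {
  assert(bfe_u32(0xABCD1234, 8, 8) == 0x12);
  assert(bfe_i32(0x0000F000, 12, 4) == -1); // field 0b1111 sign-extends
  assert(bfi(0x0000FF00, 0x12345678, 0x0) == 0x00005600);
  assert(bfm(8, 16) == 0x00FF0000);
  return 0;
}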
@@ -513,9 +705,9 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
   EVT MemVT = Store->getMemoryVT();
   unsigned MemBits = MemVT.getSizeInBits();
 
-  // Byte stores are really expensive, so if possible, try to pack
-  // 32-bit vector truncatating store into an i32 store.
-  // XXX: We could also handle optimize other vector bitwidths
+  // Byte stores are really expensive, so if possible, try to pack a 32-bit
+  // vector truncating store into an i32 store.
+  // XXX: We could also optimize other vector bitwidths.
   if (!MemVT.isVector() || MemBits > 32) {
     return SDValue();
   }
@@ -528,17 +720,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
   unsigned MemEltBits = MemEltVT.getSizeInBits();
   unsigned MemNumElements = MemVT.getVectorNumElements();
   EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
-  SDValue Mask;
-  switch(MemEltBits) {
-  case 8:
-    Mask = DAG.getConstant(0xFF, PackedVT);
-    break;
-  case 16:
-    Mask = DAG.getConstant(0xFFFF, PackedVT);
-    break;
-  default:
-    llvm_unreachable("Cannot lower this vector store");
-  }
+  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT);
+
   SDValue PackedValue;
   for (unsigned i = 0; i < MemNumElements; ++i) {
     EVT ElemVT = VT.getVectorElementType();
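The deleted switch and the new closed-form mask agree on the old cases, since (1 << N) - 1 sets exactly the low N bits; the closed form additionally covers any other sub-32-bit element width, which is why the llvm_unreachable default can go away. The shift stays in range because this path appears to be reached only with element types narrower than 32 bits. A quick check:

#include <cassert>

int main() {
  assert(((1u << 8)  - 1) == 0xFFu);    // old "case 8"
  assert(((1u << 16) - 1) == 0xFFFFu);  // old "case 16"
  assert(((1u << 4)  - 1) == 0xFu);     // now also handled
  return 0;
}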
@@ -586,18 +769,120 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
 }
 
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+
+  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() &&
+      VT.getSizeInBits() > 32) {
+    // We can do the extload to 32 bits, and then need to separately extend
+    // to 64 bits.
+    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
+                                       Load->getChain(),
+                                       Load->getBasePtr(),
+                                       MemVT,
+                                       Load->getMemOperand());
+    return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+  }
+
+  // Lower loads of constant address space global variables.
+  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {
+    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
+                                     getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+                      DAG.getConstant(2, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                       Load->getChain(), Ptr,
+                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+  }
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+  EVT MemEltVT = MemVT.getScalarType();
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret,
+                       MemEltVTNode);
+  }
+
+  return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+}
+
 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
   SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
   if (Result.getNode()) {
     return Result;
   }
 
   StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
   if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
       Store->getValue().getValueType().isVector()) {
     return SplitVectorStore(Op, DAG);
   }
+
+  EVT MemVT = Store->getMemoryVT();
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      MemVT.bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
+    SDValue BasePtr = Store->getBasePtr();
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+
+    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                  DAG.getConstant(Mask, MVT::i32), ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
   return SDValue();
 }
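LowerLOAD and LowerSTORE above emulate sub-dword private-memory accesses on top of 32-bit REGISTER_LOAD/REGISTER_STORE nodes: the address splits into a word index (Addr >> 2) and a byte lane (Addr & 3), and a narrow store becomes a read-modify-write of the containing word. A host-side model of the store sequence, assuming little-endian byte lanes:

#include <cassert>
#include <cstdint>

// Model of the read-modify-write that LowerSTORE emits for an i8 store:
// load the containing 32-bit word, clear the destination byte lane,
// OR in the shifted value, and store the word back.
static void storeByte(uint32_t *Words, uint32_t Addr, uint8_t Val) {
  uint32_t Word    = Words[Addr >> 2];   // Ptr = Addr >> 2   (SRL 2)
  uint32_t ByteIdx = Addr & 0x3;         // AND with 0x3
  uint32_t Shift   = ByteIdx << 3;       // ByteIdx * 8       (SHL 3)
  uint32_t DstMask = ~(0xFFu << Shift);  // inverted lane mask (the XOR)
  Word = (Word & DstMask) | (uint32_t(Val) << Shift);
  Words[Addr >> 2] = Word;               // REGISTER_STORE
}

int main() {
  uint32_t Mem[1] = {0xAABBCCDD};
  storeByte(Mem, 2, 0x11);               // replace byte lane 2
  assert(Mem[0] == 0xAA11CCDD);
  return 0;
}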
@@ -726,6 +1011,101 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
 
 }
 
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                                      unsigned BitsDiff,
+                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue Shift = DAG.getConstant(BitsDiff, VT);
+  // Shift left by 'Shift' bits.
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+  // Signed shift right by 'Shift' bits.
+  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
+  unsigned DestBits = ScalarVT.getSizeInBits();
+  unsigned BitsDiff = DestBits - SrcBits;
+
+  if (!Subtarget->hasBFE())
+    return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+
+  SDValue Src = Op.getOperand(0);
+  if (VT.isVector()) {
+    SDLoc DL(Op);
+    // Need to scalarize this, and revisit each of the scalars later.
+    // TODO: Don't scalarize on Evergreen?
+    unsigned NElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 8> Args;
+    ExtractVectorElements(Src, DAG, Args, 0, NElts);
+
+    SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+    for (unsigned I = 0; I < NElts; ++I)
+      Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I],
+                            VTOp);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
+  }
+
+  if (SrcBits == 32) {
+    SDLoc DL(Op);
+
+    // If the source is 32 bits, this is really half of a 2-register pair, and
+    // we need to discard the unused half of the pair.
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
+  }
+
+  unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+
+  // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+  // might not be worth the effort, and will need to expand to shifts when
+  // fixing SGPR copies.
+  if (SrcBits < 32 && DestBits <= 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    if (DestBits != 32)
+      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
+
+    // FIXME: This should use TargetConstant, but that hits assertions for
+    // Evergreen.
+    SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
+                              Op.getOperand(0), // Operand
+                              DAG.getConstant(0, ExtVT), // Offset
+                              DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    // Truncate to the original type if necessary.
+    if (ScalarVT == MVT::i32)
+      return Ext;
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
+  }
+
+  // For small types, extend to 32 bits first.
+  if (SrcBits < 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
+    SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
+                                DL,
+                                ExtVT,
+                                TruncSrc, // Operand
+                                DAG.getConstant(0, ExtVT), // Offset
+                                DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+  }
+
+  // For everything else, use the standard bitshift expansion.
+  return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+}
+
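The BFE path and ExpandSIGN_EXTEND_INREG compute the same thing: sign-extension from SrcBits inside a wider register. The shift expansion is the classic (x << d) followed by an arithmetic right shift by d, with d = DestBits - SrcBits. A worked check of that identity at 32 bits (this relies on arithmetic right shift of negative values, which holds on all common hosts):

#include <cassert>
#include <cstdint>

// sign_extend_inreg from SrcBits within a 32-bit register:
// shift the field up to the sign-bit position, then shift back
// arithmetically so the sign bit smears across the high bits.
static int32_t signExtendInReg(int32_t X, unsigned SrcBits) {
  unsigned BitsDiff = 32 - SrcBits;
  return (int32_t)((uint32_t)X << BitsDiff) >> BitsDiff;
}

int main() {
  assert(signExtendInReg(0x00FF, 8) == -1);    // i8 0xFF -> -1
  assert(signExtendInReg(0x007F, 8) == 0x7F);  // positive values unchanged
  assert(signExtendInReg(0xABCD, 16) == (int32_t)0xFFFFABCD);
  return 0;
}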
@@ -818,7 +1198,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMIN)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
+  NODE_NAME_CASE(BFI)
+  NODE_NAME_CASE(BFM)
   NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
@@ -833,3 +1218,56 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
 }
+
+static void computeMaskedBitsForMinMax(const SDValue Op0,
+                                       const SDValue Op1,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth) {
+  APInt Op0Zero, Op0One;
+  APInt Op1Zero, Op1One;
+  DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
+  DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
+
+  KnownZero = Op0Zero & Op1Zero;
+  KnownOne = Op0One & Op1One;
+}
+
+void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::INTRINSIC_WO_CHAIN: {
+    // FIXME: The intrinsic should just use the node.
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    case AMDGPUIntrinsic::AMDGPU_imax:
+    case AMDGPUIntrinsic::AMDGPU_umax:
+    case AMDGPUIntrinsic::AMDGPU_imin:
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+                                 KnownZero, KnownOne, DAG, Depth);
+      break;
+    default:
+      break;
+    }
+
+    break;
+  }
+  case AMDGPUISD::SMAX:
+  case AMDGPUISD::UMAX:
+  case AMDGPUISD::SMIN:
+  case AMDGPUISD::UMIN:
+    computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+                               KnownZero, KnownOne, DAG, Depth);
+    break;
+  default:
+    break;
+  }
+}
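The known-bits rule for min/max used above deserves one sentence of justification: the result of any min or max is always one of the two operands, so a result bit is known only if it is known, with the same value, in both operands. That is exactly the pair of intersections computed in computeMaskedBitsForMinMax. A small standalone check:

#include <cassert>
#include <cstdint>

struct KnownBits32 {
  uint32_t Zero; // bits known to be 0
  uint32_t One;  // bits known to be 1
};

// Sound for any operation whose result is one of its two operands:
// keep only the bits known identically in both.
static KnownBits32 minMaxKnownBits(KnownBits32 A, KnownBits32 B) {
  return {A.Zero & B.Zero, A.One & B.One};
}

int main() {
  KnownBits32 Op0 = {0xFFFFFF00u, 0x000000F0u}; // value is 0x000000F?
  KnownBits32 Op1 = {0xFFFFFF0Fu, 0x000000F0u}; // value is exactly 0x000000F0
  KnownBits32 R = minMaxKnownBits(Op0, Op1);
  assert(R.One == 0x000000F0u);   // bits 4-7 still known one
  assert(R.Zero == 0xFFFFFF00u);  // bits 8-31 still known zero
  return 0;
}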