diff options
Diffstat (limited to 'lib/Target/NVPTX')
22 files changed, 2031 insertions, 493 deletions
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td index d78b4e8..93fabf6 100644 --- a/lib/Target/NVPTX/NVPTX.td +++ b/lib/Target/NVPTX/NVPTX.td @@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30", "Target SM 3.0">; def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35", "Target SM 3.5">; +def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50", + "Target SM 5.0">; // PTX Versions def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30", "Use PTX version 3.0">; def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31", "Use PTX version 3.1">; +def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", + "Use PTX version 3.2">; +def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40", + "Use PTX version 4.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. @@ -52,6 +58,7 @@ def : Proc<"sm_20", [SM20]>; def : Proc<"sm_21", [SM21]>; def : Proc<"sm_30", [SM30]>; def : Proc<"sm_35", [SM35]>; +def : Proc<"sm_50", [SM50]>; def NVPTXInstrInfo : InstrInfo { diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 4ec575f..decf02a 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -734,23 +734,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { << " func_retval0"; } else { if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } + unsigned totalsz = TD->getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) retAlignment = TD->getABITypeAlignment(Ty); @@ -1321,6 +1305,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { // external global variable with init -> .visible // external without init -> .extern // appending -> not allowed, assert. +// for any linkage other than +// internal, private, linker_private, +// linker_private_weak, linker_private_weak_def_auto, +// we emit -> .weak. void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, raw_ostream &O) { @@ -1346,6 +1334,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, msg.append(V->getName().str()); msg.append("has unsupported appending linkage type"); llvm_unreachable(msg.c_str()); + } else if (!V->hasInternalLinkage() && + !V->hasPrivateLinkage()) { + O << ".weak "; } } } @@ -1356,10 +1347,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Skip meta data if (GVar->hasSection()) { - if (GVar->getSection() == "llvm.metadata") + if (GVar->getSection() == StringRef("llvm.metadata")) return; } + // Skip LLVM intrinsic global variables + if (GVar->getName().startswith("llvm.") || + GVar->getName().startswith("nvvm.")) + return; + const DataLayout *TD = TM.getDataLayout(); // GlobalVariables are always constant pointers themselves. @@ -1371,6 +1367,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".visible "; else O << ".extern "; + } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() || + GVar->hasAvailableExternallyLinkage() || + GVar->hasCommonLinkage()) { + O << ".weak "; } if (llvm::isTexture(*GVar)) { @@ -1438,7 +1438,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "linear"; break; case 2: - assert(0 && "Anisotropic filtering is not supported"); + llvm_unreachable("Anisotropic filtering is not supported"); default: O << "nearest"; break; @@ -1480,6 +1480,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); + + if (isManaged(*GVar)) { + O << " .attribute(.managed)"; + } + if (GVar->getAlignment() == 0) O << " .align " << (int) TD->getPrefTypeAlignment(ETy); else @@ -1497,13 +1502,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. - if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && - GVar->hasInitializer()) { - const Constant *Initializer = GVar->getInitializer(); - if (!Initializer->isNullValue()) { - O << " = "; - printScalarConstant(Initializer, O); + if (GVar->hasInitializer()) { + if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + const Constant *Initializer = GVar->getInitializer(); + // 'undef' is treated as there is no value spefied. + if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { + O << " = "; + printScalarConstant(Initializer, O); + } + } else { + // The frontend adds zero-initializer to variables that don't have an + // initial value, so skip warning for this case. + if (!GVar->getInitializer()->isNullValue()) { + std::string warnMsg = "initial value of '" + GVar->getName().str() + + "' is not allowed in addrspace(" + + llvm::utostr_32(PTy->getAddressSpace()) + ")"; + report_fatal_error(warnMsg.c_str()); + } } } } else { @@ -1562,7 +1578,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } } @@ -1682,7 +1698,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "]"; break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } return; } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9030584..8b088412 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -26,6 +26,10 @@ using namespace llvm; +NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + is64bit(STI.is64Bit()) {} + bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { // cvta.local %SP, %SPL; if (is64bit) { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), LocalReg).addImm(MF.getFunctionNumber()); } else { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), LocalReg).addImm(MF.getFunctionNumber()); } } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 2ae6d72..56fb673 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -17,16 +17,12 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { -class NVPTXTargetMachine; - +class NVPTXSubtarget; class NVPTXFrameLowering : public TargetFrameLowering { - NVPTXTargetMachine &tm; bool is64bit; public: - explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm), - is64bit(_is64bit) {} + explicit NVPTXFrameLowering(NVPTXSubtarget &STI); bool hasFP(const MachineFunction &MF) const override; void emitPrologue(MachineFunction &MF) const override; diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 023dd5e..faa9fdb 100644 --- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -84,7 +84,7 @@ bool GenericToNVVM::runOnModule(Module &M) { GlobalVariable *GV = I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && - !GV->getName().startswith("llvm.")) { + !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { GlobalVariable *NewGV = new GlobalVariable( M, GV->getType()->getElementType(), GV->isConstant(), GV->getLinkage(), diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index cd30880..0dfbf10 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -24,11 +24,14 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-isel" -static cl::opt<int> -FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); +unsigned FMAContractLevel = 0; + +static cl::opt<unsigned, true> +FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, + cl::desc("NVPTX Specific: FMA contraction (0: don't do it" + " 1: do it 2: do it aggressively"), + cl::location(FMAContractLevel), + cl::init(2)); static cl::opt<int> UsePrecDivF32( "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, @@ -138,7 +141,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - ResNode = SelectLDGLDUVector(N); + ResNode = SelectLDGLDU(N); break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -164,6 +167,9 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::INTRINSIC_WO_CHAIN: ResNode = SelectIntrinsicNoChain(N); break; + case ISD::INTRINSIC_W_CHAIN: + ResNode = SelectIntrinsicChain(N); + break; case NVPTXISD::Tex1DFloatI32: case NVPTXISD::Tex1DFloatFloat: case NVPTXISD::Tex1DFloatFloatLevel: @@ -253,6 +259,12 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::Suld3DV4I32Trap: ResNode = SelectSurfaceIntrinsic(N); break; + case ISD::AND: + case ISD::SRA: + case ISD::SRL: + // Try to select BFE + ResNode = SelectBFE(N); + break; case ISD::ADDRSPACECAST: ResNode = SelectAddrSpaceCast(N); break; @@ -264,6 +276,21 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) { + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + return SelectLDGLDU(N); + } +} + static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { const Value *Src = N->getMemOperand()->getValue(); @@ -981,22 +1008,101 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { return LD; } -SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { +SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); + SDValue Op1; + MemSDNode *Mem; + bool IsLDG = true; + + // If this is an LDG intrinsic, the address is the third operand. Its its an + // LDG/LDU SD node (from custom vector handling), then its the second operand + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + Op1 = N->getOperand(2); + Mem = cast<MemIntrinsicSDNode>(N); + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + IsLDG = true; + break; + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + IsLDG = false; + break; + } + } else { + Op1 = N->getOperand(1); + Mem = cast<MemSDNode>(N); + } + unsigned Opcode; SDLoc DL(N); SDNode *LD; - MemSDNode *Mem = cast<MemSDNode>(N); SDValue Base, Offset, Addr; - EVT EltVT = Mem->getMemoryVT().getVectorElementType(); + EVT EltVT = Mem->getMemoryVT(); + if (EltVT.isVector()) { + EltVT = EltVT.getVectorElementType(); + } if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1092,6 +1198,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1181,6 +1336,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1276,6 +1480,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1365,6 +1618,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1457,7 +1759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + MemRefs0[0] = Mem->getMemOperand(); cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); return LD; @@ -2959,6 +3261,214 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { return Ret; } +/// SelectBFE - Look for instruction sequences that can be made more efficient +/// by using the 'bfe' (bit-field extract) PTX instruction +SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Len; + SDValue Start; + SDValue Val; + bool IsSigned = false; + + if (N->getOpcode() == ISD::AND) { + // Canonicalize the operands + // We want 'and %val, %mask' + if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) { + std::swap(LHS, RHS); + } + + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); + if (!Mask) { + // We need a constant mask on the RHS of the AND + return NULL; + } + + // Extract the mask bits + uint64_t MaskVal = Mask->getZExtValue(); + if (!isMask_64(MaskVal)) { + // We *could* handle shifted masks here, but doing so would require an + // 'and' operation to fix up the low-order bits so we would trade + // shr+and for bfe+and, which has the same throughput + return NULL; + } + + // How many bits are in our mask? + uint64_t NumBits = CountTrailingOnes_64(MaskVal); + Len = CurDAG->getTargetConstant(NumBits, MVT::i32); + + if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { + // We have a 'srl/and' pair, extract the effective start bit and length + Val = LHS.getNode()->getOperand(0); + Start = LHS.getNode()->getOperand(1); + ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start); + if (StartConst) { + uint64_t StartVal = StartConst->getZExtValue(); + // How many "good" bits do we have left? "good" is defined here as bits + // that exist in the original value, not shifted in. + uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal; + if (NumBits > GoodBits) { + // Do not handle the case where bits have been shifted in. In theory + // we could handle this, but the cost is likely higher than just + // emitting the srl/and pair. + return NULL; + } + Start = CurDAG->getTargetConstant(StartVal, MVT::i32); + } else { + // Do not handle the case where the shift amount (can be zero if no srl + // was found) is not constant. We could handle this case, but it would + // require run-time logic that would be more expensive than just + // emitting the srl/and pair. + return NULL; + } + } else { + // Do not handle the case where the LHS of the and is not a shift. While + // it would be trivial to handle this case, it would just transform + // 'and' -> 'bfe', but 'and' has higher-throughput. + return NULL; + } + } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { + if (LHS->getOpcode() == ISD::AND) { + ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS); + if (!ShiftCnst) { + // Shift amount must be constant + return NULL; + } + + uint64_t ShiftAmt = ShiftCnst->getZExtValue(); + + SDValue AndLHS = LHS->getOperand(0); + SDValue AndRHS = LHS->getOperand(1); + + // Canonicalize the AND to have the mask on the RHS + if (isa<ConstantSDNode>(AndLHS)) { + std::swap(AndLHS, AndRHS); + } + + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS); + if (!MaskCnst) { + // Mask must be constant + return NULL; + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + uint64_t NumZeros; + uint64_t NumBits; + if (isMask_64(MaskVal)) { + NumZeros = 0; + // The number of bits in the result bitfield will be the number of + // trailing ones (the AND) minus the number of bits we shift off + NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt; + } else if (isShiftedMask_64(MaskVal)) { + NumZeros = countTrailingZeros(MaskVal); + unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros); + // The number of bits in the result bitfield will be the number of + // trailing zeros plus the number of set bits in the mask minus the + // number of bits we shift off + NumBits = NumZeros + NumOnes - ShiftAmt; + } else { + // This is not a mask we can handle + return NULL; + } + + if (ShiftAmt < NumZeros) { + // Handling this case would require extra logic that would make this + // transformation non-profitable + return NULL; + } + + Val = AndLHS; + Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32); + Len = CurDAG->getTargetConstant(NumBits, MVT::i32); + } else if (LHS->getOpcode() == ISD::SHL) { + // Here, we have a pattern like: + // + // (sra (shl val, NN), MM) + // or + // (srl (shl val, NN), MM) + // + // If MM >= NN, we can efficiently optimize this with bfe + Val = LHS->getOperand(0); + + SDValue ShlRHS = LHS->getOperand(1); + ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS); + if (!ShlCnst) { + // Shift amount must be constant + return NULL; + } + uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); + + SDValue ShrRHS = RHS; + ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS); + if (!ShrCnst) { + // Shift amount must be constant + return NULL; + } + uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); + + // To avoid extra codegen and be profitable, we need Outer >= Inner + if (OuterShiftAmt < InnerShiftAmt) { + return NULL; + } + + // If the outer shift is more than the type size, we have no bitfield to + // extract (since we also check that the inner shift is <= the outer shift + // then this also implies that the inner shift is < the type size) + if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) { + return NULL; + } + + Start = + CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32); + Len = + CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() - + OuterShiftAmt, MVT::i32); + + if (N->getOpcode() == ISD::SRA) { + // If we have a arithmetic right shift, we need to use the signed bfe + // variant + IsSigned = true; + } + } else { + // No can do... + return NULL; + } + } else { + // No can do... + return NULL; + } + + + unsigned Opc; + // For the BFE operations we form here from "and" and "srl", always use the + // unsigned variants. + if (Val.getValueType() == MVT::i32) { + if (IsSigned) { + Opc = NVPTX::BFE_S32rii; + } else { + Opc = NVPTX::BFE_U32rii; + } + } else if (Val.getValueType() == MVT::i64) { + if (IsSigned) { + Opc = NVPTX::BFE_S64rii; + } else { + Opc = NVPTX::BFE_U64rii; + } + } else { + // We cannot handle this type + return NULL; + } + + SDValue Ops[] = { + Val, Start, Len + }; + + SDNode *Ret = + CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + + return Ret; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 11f92e7..c44ccb2 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -59,10 +59,11 @@ private: SDNode *Select(SDNode *N) override; SDNode *SelectIntrinsicNoChain(SDNode *N); + SDNode *SelectIntrinsicChain(SDNode *N); SDNode *SelectTexSurfHandle(SDNode *N); SDNode *SelectLoad(SDNode *N); SDNode *SelectLoadVector(SDNode *N); - SDNode *SelectLDGLDUVector(SDNode *N); + SDNode *SelectLDGLDU(SDNode *N); SDNode *SelectStore(SDNode *N); SDNode *SelectStoreVector(SDNode *N); SDNode *SelectLoadParam(SDNode *N); @@ -71,6 +72,7 @@ private: SDNode *SelectAddrSpaceCast(SDNode *N); SDNode *SelectTextureIntrinsic(SDNode *N); SDNode *SelectSurfaceIntrinsic(SDNode *N); + SDNode *SelectBFE(SDNode *N); inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index b0943be..cb452ff 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <sstream> @@ -111,6 +112,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Jump is Expensive. Don't create extra control flow for 'and', 'or' // condition branches. @@ -130,7 +132,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); // Operations not directly supported by NVPTX. - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -146,6 +154,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); + if (nvptxSubtarget.hasROT64()) { setOperationAction(ISD::ROTL, MVT::i64, Legal); setOperationAction(ISD::ROTR, MVT::i64, Legal); @@ -237,6 +252,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); + // We have some custom DAG combine patterns for these nodes + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SHL); + // Now deduce the information based on the above mentioned // actions computeRegisterProperties(); @@ -328,6 +350,16 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::StoreV2"; case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; + case NVPTXISD::FUN_SHFL_CLAMP: + return "NVPTXISD::FUN_SHFL_CLAMP"; + case NVPTXISD::FUN_SHFR_CLAMP: + return "NVPTXISD::FUN_SHFR_CLAMP"; + case NVPTXISD::IMAD: + return "NVPTXISD::IMAD"; + case NVPTXISD::MUL_WIDE_SIGNED: + return "NVPTXISD::MUL_WIDE_SIGNED"; + case NVPTXISD::MUL_WIDE_UNSIGNED: + return "NVPTXISD::MUL_WIDE_UNSIGNED"; case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32"; case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; case NVPTXISD::Tex1DFloatFloatLevel: @@ -441,8 +473,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const { - return VT.getScalarType() == MVT::i1; +TargetLoweringBase::LegalizeTypeAction +NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } SDValue @@ -487,26 +523,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, } else if (isa<PointerType>(retTy)) { O << ".param .b" << getPointerTy().getSizeInBits() << " _"; } else { - if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, retTy, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - // TODO: no need to loop - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } - O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]"; + if((retTy->getTypeID() == Type::StructTyID) || + isa<VectorType>(retTy)) { + O << ".param .align " + << retAlignment + << " .b8 _[" + << getDataLayout()->getTypeAllocSize(retTy) << "]"; } else { assert(false && "Unknown return type"); } @@ -675,7 +697,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Ty->isAggregateType()) { // aggregate SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; @@ -687,34 +710,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); - } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue StVal = OutVals[OIdx]; - if (elemtype.getSizeInBits() < 16) { - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), - StVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, CopyParamOps, - elemtype, MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - ++OIdx; + unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; + SDValue StVal = OutVals[OIdx]; + if (elemtype.getSizeInBits() < 16) { + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(Offsets[j], MVT::i32), + StVal, InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, + CopyParamVTs, CopyParamOps, + elemtype, MachinePointerInfo(), + ArgAlign); + InFlag = Chain.getValue(1); + ++OIdx; } if (vtparts.size() > 0) --OIdx; @@ -899,13 +914,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // struct or vector SmallVector<EVT, 16> vtparts; + SmallVector<uint64_t, 16> Offsets; const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputeValueVTs(*this, PTy->getElementType(), vtparts); + ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, // so we don't need to worry about natural alignment or not. // See TargetLowering::LowerCallTo(). @@ -917,38 +934,28 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); + int curOffset = Offsets[j]; + unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); + SDValue srcAddr = + DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], + DAG.getConstant(curOffset, getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, + PartAlign); + if (elemtype.getSizeInBits() < 16) { + theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, - 0); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), theVal, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, elemtype, - MachinePointerInfo()); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(curOffset, MVT::i32), theVal, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, elemtype, + MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - } + InFlag = Chain.getValue(1); } ++paramCount; } @@ -1057,7 +1064,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - unsigned resoffset = 0; if (retTy && retTy->isVectorTy()) { EVT ObjectVT = getValueType(retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); @@ -1066,14 +1072,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ObjectVT) == NumElts && "Vector was not scalarized"); unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 16 ? true : false; + bool needTruncate = sz < 8 ? true : false; if (NumElts == 1) { // Just a simple load SmallVector<EVT, 4> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); } else @@ -1097,9 +1104,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (NumElts == 2) { // LoadV2 SmallVector<EVT, 4> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); LoadRetVTs.push_back(MVT::i16); @@ -1142,9 +1150,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); for (unsigned i = 0; i < NumElts; i += VecSize) { SmallVector<EVT, 8> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 for (unsigned j = 0; j < VecSize; ++j) LoadRetVTs.push_back(MVT::i16); @@ -1183,10 +1192,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else { SmallVector<EVT, 16> VTs; - ComputePTXValueVTs(*this, retTy, VTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); + unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { unsigned sz = VTs[i].getSizeInBits(); + unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); bool needTruncate = sz < 8 ? true : false; if (VTs[i].isInteger() && (sz < 8)) sz = 8; @@ -1212,19 +1224,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 4> LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); - LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, DAG.getVTList(LoadRetVTs), LoadRetOps, - TheLoadType, MachinePointerInfo()); + TheLoadType, MachinePointerInfo(), AlignI); Chain = retval.getValue(1); InFlag = retval.getValue(2); SDValue Ret0 = retval.getValue(0); if (needTruncate) Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); InVals.push_back(Ret0); - resoffset += sz / 8; } } } @@ -1262,6 +1273,127 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); } +/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. + // {dHi, dLo} = {aHi, aLo} >> Amt + // dHi = aHi >> Amt + // dLo = shf.r.clamp aLo, aHi, Amt + + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} >> Amt + // - if (Amt>=size) then + // dLo = aHi >> (Amt-size) + // dHi = aHi >> Amt (this is either all 0 or all 1) + // else + // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) + // dHi = aHi >> Amt + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SHL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. + // {dHi, dLo} = {aHi, aLo} << Amt + // dHi = shf.l.clamp aLo, aHi, Amt + // dLo = aLo << Amt + + SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} << Amt + // - if (Amt>=size) then + // dLo = aLo << Amt (all 0) + // dLo = aLo << (Amt-size) + // else + // dLo = aLo << Amt + // dHi = (aHi << Amt) | (aLo >> (size-Amt)) + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + SDValue NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -1282,6 +1414,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } @@ -1495,7 +1632,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const Function *F = MF.getFunction(); const AttributeSet &PAL = F->getAttributes(); - const TargetLowering *TLI = nvTM->getTargetLowering(); + const TargetLowering *TLI = DAG.getTarget().getTargetLowering(); SDValue Root = DAG.getRoot(); std::vector<SDValue> OutChains; @@ -1549,8 +1686,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { - EVT partVT = vtparts[parti]; - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT)); + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); ++InsIdx; } if (vtparts.size() > 0) @@ -1866,7 +2002,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned Offset = 0; EVT VecVT = - EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize); + EVT::getVectorVT(F->getContext(), EltVT, VecSize); unsigned PerStoreOffset = TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); @@ -1925,12 +2061,10 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } } else { SmallVector<EVT, 16> ValVTs; - // const_cast is necessary since we are still using an LLVM version from - // before the type system re-write. - ComputePTXValueVTs(*this, RetTy, ValVTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0); assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); - unsigned SizeSoFar = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { SDValue theVal = OutVals[i]; EVT TheValType = theVal.getValueType(); @@ -1954,16 +2088,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, else if (TmpVal.getValueType().getSizeInBits() < 16) TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal); - SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal }; + SDValue Ops[] = { + Chain, + DAG.getConstant(Offsets[i], MVT::i32), + TmpVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, DAG.getVTList(MVT::Other), Ops, TheStoreType, MachinePointerInfo()); - if(TheValType.isVector()) - SizeSoFar += - TheStoreType.getVectorElementType().getStoreSizeInBits() / 8; - else - SizeSoFar += TheStoreType.getStoreSizeInBits()/8; } } } @@ -2220,22 +2352,62 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: + case Intrinsic::nvvm_ldu_global_p: { Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) Info.memVT = getValueType(I.getType()); - else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else Info.memVT = getValueType(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + + // alignment is available as metadata. + // Grab it and set the alignment. + assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: { + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) + Info.memVT = getValueType(I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(); else - Info.memVT = MVT::f32; + Info.memVT = getValueType(I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; Info.readMem = true; Info.writeMem = false; - Info.align = 0; + + // alignment is available as metadata. + // Grab it and set the alignment. + assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + return true; + } case Intrinsic::nvvm_tex_1d_v4f32_i32: case Intrinsic::nvvm_tex_1d_v4f32_f32: @@ -2427,6 +2599,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { switch (Constraint[0]) { default: break; + case 'b': case 'r': case 'h': case 'c': @@ -2446,6 +2619,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); case 'c': return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'h': @@ -2469,6 +2644,406 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +extern unsigned FMAContractLevel; + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SelectionDAG &DAG = DCI.DAG; + // Skip non-integer, non-scalar case + EVT VT=N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + // fold (add (mul a, b), c) -> (mad a, b, c) + // + if (N0.getOpcode() == ISD::MUL) { + assert (VT.isInteger()); + // For integer: + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. + if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || + !N0.getNode()->hasOneUse()) + return SDValue(); + + // Do the folding + return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + else if (N0.getOpcode() == ISD::FMUL) { + if (VT == MVT::f32 || VT == MVT::f64) { + if (FMAContractLevel == 0) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has less than 5 uses and all + // are add. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into fma, therefore mul is still needed anyway. + // If there are more than 4 uses, even if they are all add, fusing + // them will increase register pressue. + // + int numUses = 0; + int nonAddCount = 0; + for (SDNode::use_iterator UI = N0.getNode()->use_begin(), + UE = N0.getNode()->use_end(); + UI != UE; ++UI) { + numUses++; + SDNode *User = *UI; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; + } + if (numUses >= 5) + return SDValue(); + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // simple heuristics here for considering potential register + // pressure, the logics here is that the differnce are used + // to measure the distance between def and use, the longer distance + // more likely cause register pressure. + if (orderNo - orderNo2 < 500) + return SDValue(); + + // Now, check if at least one of the FMUL's operands is live beyond the node N, + // which guarantees that the FMA will not increase register pressure at node N. + bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, + OptLevel); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + // nodes. Do that here. + SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa<ConstantSDNode>(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast<MemSDNode>(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != 0) { + // Re-insert the ext as a zext. + Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. Just replace it with the load + DCI.CombineTo(N, Val, AddTo); + } + + return SDValue(); +} + +enum OperandSignedness { + Signed = 0, + Unsigned, + Unknown +}; + +/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand +/// that can be demoted to \p OptSize bits without loss of information. The +/// signedness of the operand, if determinable, is placed in \p S. +static bool IsMulWideOperandDemotable(SDValue Op, + unsigned OptSize, + OperandSignedness &S) { + S = Unknown; + + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Signed; + return true; + } + } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Unsigned; + return true; + } + } + + return false; +} + +/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can +/// be demoted to \p OptSize bits without loss of information. If the operands +/// contain a constant, it should appear as the RHS operand. The signedness of +/// the operands is placed in \p IsSigned. +static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, + unsigned OptSize, + bool &IsSigned) { + + OperandSignedness LHSSign; + + // The LHS operand must be a demotable op + if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) + return false; + + // We should have been able to determine the signedness from the LHS + if (LHSSign == Unknown) + return false; + + IsSigned = (LHSSign == Signed); + + // The RHS can be a demotable op or a constant + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { + APInt Val = CI->getAPIntValue(); + if (LHSSign == Unsigned) { + if (Val.isIntN(OptSize)) { + return true; + } + return false; + } else { + if (Val.isSignedIntN(OptSize)) { + return true; + } + return false; + } + } else { + OperandSignedness RHSSign; + if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) + return false; + + if (LHSSign != RHSSign) + return false; + + return true; + } +} + +/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply +/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform +/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift +/// amount. +static SDValue TryMULWIDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT MulType = N->getValueType(0); + if (MulType != MVT::i32 && MulType != MVT::i64) { + return SDValue(); + } + + unsigned OptSize = MulType.getSizeInBits() >> 1; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Canonicalize the multiply so the constant (if any) is on the right + if (N->getOpcode() == ISD::MUL) { + if (isa<ConstantSDNode>(LHS)) { + std::swap(LHS, RHS); + } + } + + // If we have a SHL, determine the actual multiply amount + if (N->getOpcode() == ISD::SHL) { + ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); + if (!ShlRHS) { + return SDValue(); + } + + APInt ShiftAmt = ShlRHS->getAPIntValue(); + unsigned BitWidth = MulType.getSizeInBits(); + if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { + APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, MulType); + } else { + return SDValue(); + } + } + + bool Signed; + // Verify that our operands are demotable + if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { + return SDValue(); + } + + EVT DemotedVT; + if (MulType == MVT::i32) { + DemotedVT = MVT::i16; + } else { + DemotedVT = MVT::i32; + } + + // Truncate the operands to the correct size. Note that these are just for + // type consistency and will (likely) be eliminated in later phases. + SDValue TruncLHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS); + SDValue TruncRHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS); + + unsigned Opc; + if (Signed) { + Opc = NVPTXISD::MUL_WIDE_SIGNED; + } else { + Opc = NVPTXISD::MUL_WIDE_UNSIGNED; + } + + return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS); +} + +/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. +static SDValue PerformSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + // FIXME: Get this from the DAG somehow + CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive; + switch (N->getOpcode()) { + default: break; + case ISD::ADD: + case ISD::FADD: + return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::AND: + return PerformANDCombine(N, DCI); + } + return SDValue(); +} + /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) { diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 7bad8a2..7b4026d 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -16,7 +16,6 @@ #define NVPTXISELLOWERING_H #include "NVPTX.h" -#include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -50,6 +49,11 @@ enum NodeType { CallSeqBegin, CallSeqEnd, CallPrototype, + FUN_SHFL_CLAMP, + FUN_SHFR_CLAMP, + MUL_WIDE_SIGNED, + MUL_WIDE_UNSIGNED, + IMAD, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -167,6 +171,8 @@ enum NodeType { }; } +class NVPTXSubtarget; + //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// @@ -196,9 +202,9 @@ public: /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; - EVT getSetCCResultType(LLVMContext &, EVT VT) const override { + EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override { if (VT.isVector()) - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); return MVT::i1; } @@ -236,7 +242,8 @@ public: // PTX always uses 32-bit shift amounts MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } - bool shouldSplitVectorType(EVT VT) const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; private: const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here @@ -255,8 +262,12 @@ private: SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const; diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index 397f4bc..a98fb37 100644 --- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -146,7 +146,7 @@ bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) { void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) { // We implement "poor man's DCE" here to make sure any code that is no longer // live is actually unreachable and can be trivially eliminated by the - // unreachable block elimiation pass. + // unreachable block elimination pass. for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ++UI) { if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) { diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index cdc8088..b5b4fbe 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -29,8 +29,8 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} // FIXME: Add the subtarget support on this constructor. -NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) - : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {} +NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI) + : NVPTXGenInstrInfo(), RegInfo(STI) {} void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h index 88a9e45..2ac2974 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -24,11 +24,10 @@ namespace llvm { class NVPTXInstrInfo : public NVPTXGenInstrInfo { - NVPTXTargetMachine &TM; const NVPTXRegisterInfo RegInfo; virtual void anchor(); public: - explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + explicit NVPTXInstrInfo(NVPTXSubtarget &STI); const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index fbcd0e4..d2c0373 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -158,9 +158,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; +def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">; def true : Predicate<"1">; +def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">; + //===----------------------------------------------------------------------===// // Some Common Instruction Class Templates @@ -461,33 +464,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(temp.shl(v), MVT::i16); }]>; -def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDES64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDES64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDEU64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDEU64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), +def MULWIDES32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDES32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDEU32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, @@ -507,25 +522,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + + +def SDTMulWide + : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed + : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned + : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, Requires<[doMulWide]>; defm MULT : I3<"mul.lo.s", mul>; @@ -541,69 +594,75 @@ defm SREM : I3<"rem.s", srem>; defm UREM : I3<"rem.u", urem>; // The ri version will not be selected as DAGCombiner::visitUREM will lower it. +def SDTIMAD + : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, + SDTCisInt<2>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def imad + : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), - imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, imm:$c))]>; def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, imm:$c))]>; def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), imm:$c))]>; - + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, imm:$c))]>; def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "neg.s16 \t$dst, $src;", @@ -809,36 +868,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), - Float32Regs:$c))]>, Requires<[Pred]>; - // This is to WAR a weird bug in Tablegen that does not automatically - // generate the following permutated rule rrr2 from the above rrr. - // So we explicitly add it here. This happens to FMA32 only. - // See the comments at FMAD32 and FMA32 for more information. - def rrr2 : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd Float32Regs:$c, - (fmul Float32Regs:$a, Float32Regs:$b)))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>, Requires<[Pred]>; def rir : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } @@ -846,73 +895,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, Float64Regs:$b), - Float64Regs:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>, + Requires<[Pred]>; def rri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a, - Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>, + Requires<[Pred]>; def rir : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } -// Due to a unknown reason (most likely a bug in tablegen), tablegen does not -// automatically generate the rrr2 rule from -// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32. -// If we reverse the order of the following two lines, then rrr2 rule will be -// generated for FMA32, but not for rrr. -// Therefore, we manually write the rrr2 rule in FPCONTRACT32. -defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; -defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; -defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; - -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c -// b*c-a => fmad(b, c, -a) -// - legal because b*c-a <=> b*c+(-a) -multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), - (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, - Requires<[Pred]>; - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), - (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), - (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, - Requires<[Pred]>; -} - -defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>; -defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>; -defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>; def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", @@ -1083,6 +1091,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> { defm SRA : RSHIFT_FORMAT<"shr.s", sra>; defm SRL : RSHIFT_FORMAT<"shr.u", srl>; +// +// Rotate: use ptx shf instruction if available. +// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// +// Rotate: if ptx shf instruction is not available, then use shift+add +// // 32bit def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), @@ -1100,9 +1145,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{ }]>; def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1115,7 +1162,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1128,7 +1176,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; // 64bit def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, @@ -1177,6 +1226,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, !strconcat("}}", ""))))))))), [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; +// BFE - bit-field extract + +multiclass BFE<string TyStr, RegisterClass RC> { + // BFE supports both 32-bit and 64-bit values, but the start and length + // operands are always 32-bit + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +defm BFE_S32 : BFE<"s32", Int32Regs>; +defm BFE_U32 : BFE<"u32", Int32Regs>; +defm BFE_S64 : BFE<"s64", Int64Regs>; +defm BFE_U64 : BFE<"u64", Int64Regs>; //----------------------------------- // General Comparison @@ -1292,6 +1364,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; +// +// Funnnel shift in clamp mode +// +// - SDNodes are created so they can be used in the DAG code, +// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +// +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + +def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + //----------------------------------- // Data Movement (Load / Store, Move) //----------------------------------- diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 5e228fc..0ad3dfa 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1057,12 +1057,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>; @@ -1072,6 +1084,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max", + atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1080,6 +1100,14 @@ defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max", + atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_min @@ -1089,12 +1117,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>; @@ -1104,6 +1144,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min", + atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1112,6 +1160,14 @@ defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min", + atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_inc atom_dec @@ -1153,6 +1209,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>; @@ -1162,6 +1224,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and", + atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and", + atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and", + atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_or @@ -1171,6 +1241,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>; @@ -1180,6 +1256,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or", + atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or", + atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>; +defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or", + atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>; // atom_xor @@ -1189,6 +1273,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>; @@ -1198,6 +1288,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor", + atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor", + atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor", + atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_cas @@ -1276,67 +1374,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, // Support for ldu on sm_20 or later //----------------------------------- -def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - // Scalar -// @TODO: Revisit this, Changed imemAny to imem -multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDU_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; } -multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; -} - -defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, - ldu_i8>; -defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_p>; -defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1406,65 +1470,40 @@ defm INT_PTX_LDU_G_v4f32_ELE // Support for ldg on sm_35 or later //----------------------------------- -def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - -multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDG_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; -} - -multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; } defm INT_PTX_LDG_GLOBAL_i8 - : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; + : LDG_G<"u8 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i16 - : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u16 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_i64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; defm INT_PTX_LDG_GLOBAL_f32 - : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f32 \t$result, [$src];", Float32Regs>; defm INT_PTX_LDG_GLOBAL_f64 - : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f64 \t$result, [$src];", Float64Regs>; defm INT_PTX_LDG_GLOBAL_p32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_p64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1689,6 +1728,207 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), [(int_nvvm_compiler_error Int64Regs:$a)]>; +// isspacep + +def ISSPACEP_CONST_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_CONST_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_GLOBAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>; +def ISSPACEP_GLOBAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>; +def ISSPACEP_LOCAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>; +def ISSPACEP_LOCAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>; +def ISSPACEP_SHARED_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>; +def ISSPACEP_SHARED_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>; + + +// Special register reads +def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d), + (ins SpecialRegs:$r), + "mov.b32\t$d, $r;", []>; + +def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; + + +// rotate builtin support + +def ROTATE_B32_HW_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTATE_B32_HW_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), + (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]> ; + +def GET_LO_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def GET_HI_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; + +def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), + (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src))> ; + +// funnel shift, requires >= sm_32 +def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +// HW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt), + (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt), + (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + +// SW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; + + //----------------------------------- // Texture Intrinsics //----------------------------------- diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index 0ee018c..5547649 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -66,7 +66,7 @@ public: const MCAsmLayout *Layout) const override { return false; } - void AddValueSymbols(MCAssembler *) const override {}; + void visitUsedExpr(MCStreamer &Streamer) const override {}; const MCSection *FindAssociatedSection() const override { return nullptr; } diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index 7a38a66..3482248 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -46,6 +46,10 @@ foreach i = 0-4 in { def da#i : NVPTXReg<"%da"#i>; } +foreach i = 0-31 in { + def ENVREG#i : NVPTXReg<"%envreg"#i>; +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// @@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>; def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. -def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot, + (sequence "ENVREG%u", 0, 31))>; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index 8c7df52..d5cded2 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -25,10 +25,41 @@ using namespace llvm; // Pin the vtable to this file. void NVPTXSubtarget::anchor() {} +static std::string computeDataLayout(bool is64Bit) { + std::string Ret = "e"; + + if (!is64Bit) + Ret += "-p:32:32"; + + Ret += "-i64:64-v16:16-v32:32-n16:32:64"; + + return Ret; +} + +NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + // Provide the default CPU if we don't have one. + if (CPU.empty() && FS.size()) + llvm_unreachable("we are not using FeatureStr"); + TargetName = CPU.empty() ? "sm_20" : CPU; + + ParseSubtargetFeatures(TargetName, FS); + + // Set default to PTX 3.2 (CUDA 5.5) + if (PTXVersion == 0) { + PTXVersion = 32; + } + + return *this; +} + NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) + const std::string &FS, const TargetMachine &TM, + bool is64Bit) : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0), - SmVersion(20) { + SmVersion(20), DL(computeDataLayout(is64Bit)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) { Triple T(TT); @@ -36,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; - - // Provide the default CPU if none - std::string defCPU = "sm_20"; - - ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS); - - // Get the TargetName from the FS if available - if (FS.empty() && CPU.empty()) - TargetName = defCPU; - else if (!CPU.empty()) - TargetName = CPU; - else - llvm_unreachable("we are not using FeatureStr"); - - // We default to PTX 3.1, but we cannot just default to it in the initializer - // since the attribute parser checks if the given option is >= the default. - // So if we set ptx31 as the default, the ptx30 attribute would never match. - // Instead, we use 0 as the default and manually set 31 if the default is - // used. - if (PTXVersion == 0) { - PTXVersion = 31; - } } diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index 581e5ed..3ed5747 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -15,6 +15,12 @@ #define NVPTXSUBTARGET_H #include "NVPTX.h" +#include "NVPTXFrameLowering.h" +#include "NVPTXISelLowering.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -35,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned int SmVersion; + const DataLayout DL; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but need a NVPTX specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + public: /// This constructor initializes the data members to match that /// of the specified module. /// NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); + const std::string &FS, const TargetMachine &TM, bool is64Bit); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const NVPTXRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } bool hasBrkPt() const { return SmVersion >= 11; } bool hasAtomRedG32() const { return SmVersion >= 11; } @@ -57,10 +81,12 @@ public: bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } bool hasLDG() const { return SmVersion >= 32; } - bool hasLDU() const { return SmVersion >= 20; } + bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); } bool hasGenericLdSt() const { return SmVersion >= 20; } - inline bool hasHWROT32() const { return false; } - inline bool hasSWROT32() const { return true; } + inline bool hasHWROT32() const { return SmVersion >= 32; } + inline bool hasSWROT32() const { + return ((SmVersion >= 20) && (SmVersion < 32)); + } inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); } inline bool hasROT64() const { return SmVersion >= 20; } @@ -76,6 +102,7 @@ public: unsigned getPTXVersion() const { return PTXVersion; } + NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef FS); }; diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 26a4f84..069a1b9 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -66,26 +66,13 @@ extern "C" void LLVMInitializeNVPTXTarget() { *PassRegistry::getPassRegistry()); } -static std::string computeDataLayout(const NVPTXSubtarget &ST) { - std::string Ret = "e"; - - if (!ST.is64Bit()) - Ret += "-p:32:32"; - - Ret += "-i64:64-v16:16-v32:32-n16:32:64"; - - return Ret; -} - -NVPTXTargetMachine::NVPTXTargetMachine( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit) +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering( - *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } @@ -119,6 +106,7 @@ public: bool addInstSelector() override; bool addPreRegAlloc() override; bool addPostRegAlloc() override; + void addMachineSSAOptimization() override; FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; @@ -220,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { printAndVerify("After StackSlotColoring"); } + +void NVPTXPassConfig::addMachineSSAOptimization() { + // Pre-ra tail duplication. + if (addPass(&EarlyTailDuplicateID)) + printAndVerify("After Pre-RegAlloc TailDuplicate"); + + // Optimize PHIs before DCE: removing dead PHI cycles may make more + // instructions dead. + addPass(&OptimizePHIsID); + + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(&StackColoringID); + + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(&LocalStackSlotAllocationID); + + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + addPass(&DeadMachineInstructionElimID); + printAndVerify("After codegen DCE pass"); + + // Allow targets to insert passes that improve instruction level parallelism, + // like if-conversion. Such passes will typically need dominator trees and + // loop info, just like LICM and CSE below. + if (addILPOpts()) + printAndVerify("After ILP optimizations"); + + addPass(&MachineLICMID); + addPass(&MachineCSEID); + + addPass(&MachineSinkingID); + printAndVerify("After Machine LICM, CSE and Sinking passes"); + + addPass(&PeepholeOptimizerID); + printAndVerify("After codegen peephole optimization pass"); +} diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 2db7c18..a7a1c8f 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -14,13 +14,8 @@ #ifndef NVPTX_TARGETMACHINE_H #define NVPTX_TARGETMACHINE_H -#include "ManagedStringPool.h" -#include "NVPTXFrameLowering.h" -#include "NVPTXISelLowering.h" -#include "NVPTXInstrInfo.h" -#include "NVPTXRegisterInfo.h" #include "NVPTXSubtarget.h" -#include "llvm/IR/DataLayout.h" +#include "ManagedStringPool.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -31,50 +26,37 @@ namespace llvm { /// class NVPTXTargetMachine : public LLVMTargetMachine { NVPTXSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - NVPTXInstrInfo InstrInfo; - NVPTXTargetLowering TLInfo; - TargetSelectionDAGInfo TSInfo; - - // NVPTX does not have any call stack frame, but need a NVPTX specific - // FrameLowering class because TargetFrameLowering is abstract. - NVPTXFrameLowering FrameLowering; // Hold Strings that can be free'd all together with NVPTXTargetMachine ManagedStringPool ManagedStrPool; - //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, - // bool DisableVerify, MCContext *&OutCtx); - public: NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const NVPTXInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; } - const NVPTXRegisterInfo *getRegisterInfo() const override { - return &(InstrInfo.getRegisterInfo()); + return getSubtargetImpl()->getRegisterInfo(); } - NVPTXTargetLowering *getTargetLowering() const override { - return const_cast<NVPTXTargetLowering *>(&TLInfo); + const NVPTXTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); } const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - //virtual bool addInstSelector(PassManagerBase &PM, - // CodeGenOpt::Level OptLevel); - - //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); - ManagedStringPool *getManagedStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); } diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index cb8bd72..a8d6b95 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -47,17 +48,16 @@ class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; - Function *ReflectFunction; public: static char ID; - NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) { + NVVMReflect() : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); } NVVMReflect(const StringMap<int> &Mapping) - : ModulePass(ID), ReflectFunction(nullptr) { + : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); I != E; ++I) { @@ -70,6 +70,8 @@ public: } bool runOnModule(Module &) override; +private: + bool handleFunction(Function *ReflectFunction); void setVarMap(); }; } @@ -120,19 +122,7 @@ void NVVMReflect::setVarMap() { } } -bool NVVMReflect::runOnModule(Module &M) { - if (!NVVMReflectEnabled) - return false; - - setVarMap(); - - ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); - - // If reflect function is not used, then there will be - // no entry in the module. - if (!ReflectFunction) - return false; - +bool NVVMReflect::handleFunction(Function *ReflectFunction) { // Validate _reflect function assert(ReflectFunction->isDeclaration() && "_reflect function should not have a body"); @@ -155,13 +145,15 @@ bool NVVMReflect::runOnModule(Module &M) { "Only one operand expect for _reflect function"); // In cuda, we will have an extra constant-to-generic conversion of // the string. - const Value *conv = Reflect->getArgOperand(0); - assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion"); - const CallInst *ConvCall = cast<CallInst>(conv); - const Value *str = ConvCall->getArgOperand(0); - assert(isa<ConstantExpr>(str) && + const Value *Str = Reflect->getArgOperand(0); + if (isa<CallInst>(Str)) { + // CUDA path + const CallInst *ConvCall = cast<CallInst>(Str); + Str = ConvCall->getArgOperand(0); + } + assert(isa<ConstantExpr>(Str) && "Format of _reflect function not recognized"); - const ConstantExpr *GEP = cast<ConstantExpr>(str); + const ConstantExpr *GEP = cast<ConstantExpr>(Str); const Value *Sym = GEP->getOperand(0); assert(isa<Constant>(Sym) && "Format of _reflect function not recognized"); @@ -195,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) { ToRemove[i]->eraseFromParent(); return true; } + +bool NVVMReflect::runOnModule(Module &M) { + if (!NVVMReflectEnabled) + return false; + + setVarMap(); + + + bool Res = false; + std::string Name; + Type *Tys[1]; + Type *I8Ty = Type::getInt8Ty(M.getContext()); + Function *ReflectFunction; + + // Check for standard overloaded versions of llvm.nvvm.reflect + + for (unsigned i = 0; i != 5; ++i) { + Tys[0] = PointerType::get(I8Ty, i); + Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys); + ReflectFunction = M.getFunction(Name); + if(ReflectFunction != 0) { + Res |= handleFunction(ReflectFunction); + } + } + + ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); + // If reflect function is not used, then there will be + // no entry in the module. + if (ReflectFunction != 0) + Res |= handleFunction(ReflectFunction); + + return Res; +} diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h index 45cc0b8..02c5a94 100644 --- a/lib/Target/NVPTX/cl_common_defines.h +++ b/lib/Target/NVPTX/cl_common_defines.h @@ -1,5 +1,5 @@ -#ifndef __CL_COMMON_DEFINES_H__ -#define __CL_COMMON_DEFINES_H__ +#ifndef CL_COMMON_DEFINES_H +#define CL_COMMON_DEFINES_H // This file includes defines that are common to both kernel code and // the NVPTX back-end. @@ -119,4 +119,4 @@ typedef enum clk_sampler_type { #define CLK_LOCAL_MEM_FENCE (1 << 0) #define CLK_GLOBAL_MEM_FENCE (1 << 1) -#endif // __CL_COMMON_DEFINES_H__ +#endif // CL_COMMON_DEFINES_H |