Diffstat (limited to 'lib/Target/NVPTX')
-rw-r--r-- | lib/Target/NVPTX/CMakeLists.txt | 2
-rw-r--r-- | lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 1
-rw-r--r-- | lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 10
-rw-r--r-- | lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h | 2
-rw-r--r-- | lib/Target/NVPTX/NVPTX.h | 6
-rw-r--r-- | lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 211
-rw-r--r-- | lib/Target/NVPTX/NVPTXAsmPrinter.h | 31
-rw-r--r-- | lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 436
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 59
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 4
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelLowering.cpp | 33
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelLowering.h | 6
-rw-r--r-- | lib/Target/NVPTX/NVPTXInstrInfo.cpp | 46
-rw-r--r-- | lib/Target/NVPTX/NVPTXInstrInfo.td | 3
-rw-r--r-- | lib/Target/NVPTX/NVPTXIntrinsics.td | 40
-rw-r--r-- | lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 225
-rw-r--r-- | lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 4
-rw-r--r-- | lib/Target/NVPTX/NVPTXTargetMachine.cpp | 48
-rw-r--r-- | lib/Target/NVPTX/NVVMReflect.cpp | 24
19 files changed, 955 insertions, 236 deletions
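The bulk of the new code below is the GenericToNVVM module pass (NVPTXGenericToNVVM.cpp), which clones each global variable left in the generic address space into the global address space and rewrites its uses through the nvvm.ptr.global.to.gen conversion, plus a target-specific prolog/epilog inserter. As a rough sketch only, here is how a driver might schedule the module-level passes newly exported from the in-tree NVPTX.h header against the PassManager API of this LLVM version; the wrapper function name is hypothetical and not part of the patch:

    #include "NVPTX.h"             // in-tree header; declares createGenericToNVVMPass() / createNVVMReflectPass()
    #include "llvm/PassManager.h"

    // Illustrative only: run the IR-level NVPTX passes added by this change.
    static void addNVPTXModulePasses(llvm::PassManager &PM) {
      // Clone generic-address-space globals into the global address space and
      // rewrite their uses via the generic<->global conversion intrinsics.
      PM.add(llvm::createGenericToNVVMPass());
      // Fold __nvvm_reflect() queries using the default (empty) mapping.
      PM.add(llvm::createNVVMReflectPass());
    }

createNVPTXPrologEpilogPass() is a MachineFunctionPass and would instead be scheduled inside the codegen pipeline (NVPTXTargetMachine.cpp is also touched by this change), not from an IR-level pass manager.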
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 7da2fed..a8293da 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -23,6 +23,8 @@ set(NVPTXCodeGen_sources NVPTXAsmPrinter.cpp NVPTXUtilities.cpp NVVMReflect.cpp + NVPTXGenericToNVVM.cpp + NVPTXPrologEpilogPass.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index b3e8b5d..edf4a80 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -22,7 +22,6 @@ namespace llvm { enum AddressSpace { ADDRESS_SPACE_GENERIC = 0, ADDRESS_SPACE_GLOBAL = 1, - ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space ADDRESS_SPACE_SHARED = 3, ADDRESS_SPACE_CONST = 4, ADDRESS_SPACE_LOCAL = 5, diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 459cd96..dfa1ff5 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -17,17 +17,15 @@ using namespace llvm; -bool CompileForDebugging; - // -debug-compile - Command line option to inform opt and llc passes to // compile for debugging -static cl::opt<bool, true> -Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden, - cl::location(CompileForDebugging), cl::init(false)); +static cl::opt<bool> CompileForDebugging("debug-compile", + cl::desc("Compile for debugging"), + cl::Hidden, cl::init(false)); void NVPTXMCAsmInfo::anchor() {} -NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) { +NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) { Triple TheTriple(TT); if (TheTriple.getArch() == Triple::nvptx64) { PointerSize = CalleeSaveStackSlotSize = 8; diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 82097da..7d1633f 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -23,7 +23,7 @@ class StringRef; class NVPTXMCAsmInfo : public MCAsmInfo { virtual void anchor(); public: - explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT); + explicit NVPTXMCAsmInfo(const StringRef &TT); }; } // namespace llvm diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6a53a44..179dc27 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -16,6 +16,7 @@ #define LLVM_TARGET_NVPTX_H #include "MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/StringMap.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" @@ -26,6 +27,7 @@ namespace llvm { class NVPTXTargetMachine; class FunctionPass; +class MachineFunctionPass; class formatted_raw_ostream; namespace NVPTXCC { @@ -62,6 +64,10 @@ createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); +ModulePass *createGenericToNVVMPass(); +ModulePass *createNVVMReflectPass(); +ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); +MachineFunctionPass *createNVPTXPrologEpilogPass(); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ce5d78a..ff73931 100644 --- 
a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -68,11 +68,12 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) +void DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { - if (User *U = dyn_cast<User>(V)) { + if (const User *U = dyn_cast<User>(V)) { for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) { DiscoverDependentGlobals(U->getOperand(i), Globals); } @@ -84,8 +85,9 @@ void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { /// instances to be emitted, but only after any dependents have been added /// first. void VisitGlobalVariableForEmission( - GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order, - DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) { + const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? if (Visited.count(GV)) return; @@ -98,12 +100,12 @@ void VisitGlobalVariableForEmission( Visiting.insert(GV); // Make sure we visit all dependents first - DenseSet<GlobalVariable *> Others; + DenseSet<const GlobalVariable *> Others; for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i) DiscoverDependentGlobals(GV->getOperand(i), Others); - for (DenseSet<GlobalVariable *>::iterator I = Others.begin(), - E = Others.end(); + for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(), + E = Others.end(); I != E; ++I) VisitGlobalVariableForEmission(*I, Order, Visited, Visiting); @@ -405,6 +407,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { SmallString<128> Str; raw_svector_ostream O(Str); + if (!GlobalsEmitted) { + emitGlobals(*MF->getFunction()->getParent()); + GlobalsEmitted = true; + } + // Set up MRI = &MF->getRegInfo(); F = MF->getFunction(); @@ -429,9 +436,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { } void NVPTXAsmPrinter::EmitFunctionBodyStart() { - const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); - unsigned numRegClasses = TRI.getNumRegClasses(); - VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1]; + VRegMapping.clear(); OutStreamer.EmitRawText(StringRef("{\n")); setAndEmitFunctionVirtualRegisters(*MF); @@ -443,7 +448,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() { void NVPTXAsmPrinter::EmitFunctionBodyEnd() { OutStreamer.EmitRawText(StringRef("}\n")); - delete[] VRidGlobal2LocalMap; + VRegMapping.clear(); } void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, @@ -500,9 +505,8 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, raw_ostream &O) { const TargetRegisterClass *RC = MRI->getRegClass(vr); - unsigned id = RC->getID(); - std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[id]; + DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; unsigned mapped_vr = regmap[vr]; if (!isVec) { @@ -695,7 +699,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { else O << ".func "; printReturnValStr(F, O); - O << *CurrentFnSym << "\n"; + O << *Mang->getSymbol(F) << "\n"; 
emitFunctionParamList(F, O); O << ";\n"; } @@ -795,7 +799,7 @@ static bool useFuncSeen(const Constant *C, return false; } -void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { +void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = FI; @@ -805,7 +809,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { continue; if (F->getIntrinsicID()) continue; - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); continue; } @@ -817,14 +820,12 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration // for the original function - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } // Emit a declaration of this function if the function that // uses this constant expr has already been seen. if (useFuncSeen(C, seenMap)) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -844,7 +845,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // appearing in the module before the callee. so print out // a declaration for the callee. if (seenMap.find(caller) != seenMap.end()) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -909,7 +909,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { const_cast<TargetLoweringObjectFile &>(getObjFileLowering()) .Initialize(OutContext, TM); - Mang = new Mangler(OutContext, *TM.getDataLayout()); + Mang = new Mangler(OutContext, &TM); // Emit header before any dwarf directives are emitted below. emitHeader(M, OS1); @@ -921,6 +921,12 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) recordAndEmitFilenames(M); + GlobalsEmitted = false; + + return false; // success +} + +void NVPTXAsmPrinter::emitGlobals(const Module &M) { SmallString<128> Str2; raw_svector_ostream OS2(Str2); @@ -931,13 +937,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { // global variable in order, and ensure that we emit it *after* its dependent // globals. We use a little extra memory maintaining both a set and a list to // have fast searches while maintaining a strict ordering. - SmallVector<GlobalVariable *, 8> Globals; - DenseSet<GlobalVariable *> GVVisited; - DenseSet<GlobalVariable *> GVVisiting; + SmallVector<const GlobalVariable *, 8> Globals; + DenseSet<const GlobalVariable *> GVVisited; + DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; - ++I) + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && @@ -951,7 +957,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { OS2 << '\n'; OutStreamer.EmitRawText(OS2.str()); - return false; // success } void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { @@ -989,6 +994,14 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { } bool NVPTXAsmPrinter::doFinalization(Module &M) { + + // If we did not emit any functions, then the global declarations have not + // yet been emitted. 
+ if (!GlobalsEmitted) { + emitGlobals(M); + GlobalsEmitted = true; + } + // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -1063,7 +1076,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, } } -void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, +void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, + raw_ostream &O, bool processDemoted) { // Skip meta data @@ -1107,10 +1121,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (llvm::isSampler(*GVar)) { O << ".global .samplerref " << llvm::getSamplerName(*GVar); - Constant *Initializer = NULL; + const Constant *Initializer = NULL; if (GVar->hasInitializer()) Initializer = GVar->getInitializer(); - ConstantInt *CI = NULL; + const ConstantInt *CI = NULL; if (Initializer) CI = dyn_cast<ConstantInt>(Initializer); if (CI) { @@ -1183,7 +1197,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (localDecls.find(demotedFunc) != localDecls.end()) localDecls[demotedFunc].push_back(GVar); else { - std::vector<GlobalVariable *> temp; + std::vector<const GlobalVariable *> temp; temp.push_back(GVar); localDecls[demotedFunc] = temp; } @@ -1199,17 +1213,20 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) { O << " ."; - O << getPTXFundamentalTypeStr(ETy, false); + // Special case: ABI requires that we use .u8 for predicates + if (ETy->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(ETy, false); O << " "; O << *Mang->getSymbol(GVar); // Ptx allows variable initilization only for constant and global state // spaces. if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const Constant *Initializer = GVar->getInitializer(); if (!Initializer->isNullValue()) { O << " = "; printScalarConstant(Initializer, O); @@ -1230,10 +1247,9 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, // Ptx allows variable initilization only for constant and // global state spaces. 
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { AggBuffer aggBuffer(ElementSize, O, *this); bufferAggregateConstant(Initializer, &aggBuffer); @@ -1283,7 +1299,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { if (localDecls.find(f) == localDecls.end()) return; - std::vector<GlobalVariable *> &gvars = localDecls[f]; + std::vector<const GlobalVariable *> &gvars = localDecls[f]; for (unsigned i = 0, e = gvars.size(); i != e; ++i) { O << "\t// demoted variable\n\t"; @@ -1301,14 +1317,6 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, O << "global"; break; case llvm::ADDRESS_SPACE_CONST: - // This logic should be consistent with that in - // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp) - if (nvptxSubtarget.hasGenericLdSt()) - O << "global"; - else - O << "const"; - break; - case llvm::ADDRESS_SPACE_CONST_NOT_GEN: O << "const"; break; case llvm::ADDRESS_SPACE_SHARED: @@ -1448,7 +1456,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O) { if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) - O << *CurrentFnSym << "_param_" << paramIndex; + O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex; else { std::string argName = I->getName(); const char *p = argName.c_str(); @@ -1507,11 +1515,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (llvm::isImage(*I)) { std::string sname = I->getName(); if (llvm::isImageWriteOnly(*I)) - O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; else // Default image is read_only - O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; } else // Should be llvm::isSampler(*I) - O << "\t.param .samplerref " << *CurrentFnSym << "_param_" + O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_" << paramIndex; continue; } @@ -1546,14 +1556,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { default: O << ".ptr "; break; - case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + case llvm::ADDRESS_SPACE_CONST: O << ".ptr .const "; break; case llvm::ADDRESS_SPACE_SHARED: O << ".ptr .shared "; break; case llvm::ADDRESS_SPACE_GLOBAL: - case llvm::ADDRESS_SPACE_CONST: O << ".ptr .global "; break; } @@ -1564,7 +1573,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // non-pointer scalar to kernel func - O << "\t.param ." 
<< getPTXFundamentalTypeStr(Ty) << " "; + O << "\t.param ."; + // Special case: predicate operands become .u8 types + if (Ty->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(Ty); + O << " "; printParamName(I, paramIndex, O); continue; } @@ -1680,48 +1695,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( for (unsigned i = 0; i < numVRs; i++) { unsigned int vr = TRI->index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); - std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[RC->getID()]; + DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; int n = regmap.size(); regmap.insert(std::make_pair(vr, n + 1)); } // Emit register declarations // @TODO: Extract out the real register usage - O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; // Emit declaration of the virtual registers or 'physical' registers for // each register class - //for (unsigned i=0; i< numRegClasses; i++) { - // std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[i]; - // const TargetRegisterClass *RC = TRI->getRegClass(i); - // std::string rcname = getNVPTXRegClassName(RC); - // std::string rcStr = getNVPTXRegClassStr(RC); - // //int n = regmap.size(); - // if (!isNVPTXVectorRegClass(RC)) { - // O << "\t.reg " << rcname << " \t" << rcStr << "<" - // << NVPTXNumRegisters << ">;\n"; - // } - - // Only declare those registers that may be used. And do not emit vector - // registers as - // they are all elementized to scalar registers. - //if (n && !isNVPTXVectorRegClass(RC)) { - // if (RegAllocNilUsed) { - // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) - // << ">;\n"; - // } - // else { - // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr) - // << "<" << 32 << ">;\n"; - // } - //} - //} + for (unsigned i=0; i< TRI->getNumRegClasses(); i++) { + const TargetRegisterClass *RC = TRI->getRegClass(i); + DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; + std::string rcname = getNVPTXRegClassName(RC); + std::string rcStr = getNVPTXRegClassStr(RC); + int n = regmap.size(); + + // Only declare those registers that may be used. 
+ if (n) { + O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) + << ">;\n"; + } + } OutStreamer.EmitRawText(O.str()); } @@ -1751,12 +1754,12 @@ void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) { O << utohexstr(API.getZExtValue()); } -void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { +void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { O << CI->getValue(); return; } - if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { printFPConstant(CFP, O); return; } @@ -1764,13 +1767,13 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { O << "0"; return; } - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { O << *Mang->getSymbol(GVar); return; } - if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { O << *Mang->getSymbol(GVar); return; } else { @@ -1781,7 +1784,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { llvm_unreachable("Not scalar type found in printScalarConstant()"); } -void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, +void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); @@ -1809,13 +1812,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, ptr = (unsigned char *)&int16; aggBuffer->addBytes(ptr, 2, Bytes); } else if (ETy == Type::getInt32Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; aggBuffer->addBytes(ptr, 4, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; @@ -1831,13 +1834,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } llvm_unreachable("unsupported integer const type"); } else if (ETy == Type::getInt64Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; aggBuffer->addBytes(ptr, 8, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; @@ -1858,7 +1861,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } case Type::FloatTyID: case Type::DoubleTyID: { - ConstantFP *CFP = 
dyn_cast<ConstantFP>(CPV); + const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); const Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); @@ -1874,10 +1877,10 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, break; } case Type::PointerTyID: { - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { aggBuffer->addSymbol(GVar); - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v); } unsigned int s = TD->getTypeAllocSize(CPV->getType()); @@ -1906,7 +1909,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } } -void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV, +void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); int Bytes; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 6dc9fc0..55f2943 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { unsigned char *buffer; // the buffer unsigned numSymbols; // number of symbol addresses SmallVector<unsigned, 4> symbolPosInBuffer; - SmallVector<Value *, 4> Symbols; + SmallVector<const Value *, 4> Symbols; private: unsigned curpos; @@ -128,7 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } - void addSymbol(Value *GVar) { + void addSymbol(const Value *GVar) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); numSymbols++; @@ -153,11 +153,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { if (pos) O << ", "; if (pos == nextSymbolPos) { - Value *v = Symbols[nSym]; - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + const Value *v = Symbols[nSym]; + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { MCSymbol *Name = AP.Mang->getSymbol(GVar); O << *Name; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { O << *nvptx::LowerConstant(Cexpr, AP); } else llvm_unreachable("symbol type unknown"); @@ -205,10 +205,12 @@ private: void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; // definition autogenerated. void printInstruction(const MachineInstr *MI, raw_ostream &O); - void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false); + void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, + bool = false); void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); + void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O); @@ -234,12 +236,16 @@ protected: private: std::string CurrentBankselLabelInBasicBlock; + bool GlobalsEmitted; + // This is specific per MachineFunction. const MachineRegisterInfo *MRI; // The contents are specific for each // MachineFunction. But the size of the // array is not. 
- std::map<unsigned, unsigned> *VRidGlobal2LocalMap; + typedef DenseMap<unsigned, unsigned> VRegMap; + typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap; + VRegRCMap VRegMapping; // cache the subtarget here. const NVPTXSubtarget &nvptxSubtarget; // Build the map between type name and ID based on module's type @@ -247,7 +253,7 @@ private: std::map<const Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. - std::map<const Function *, std::vector<GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -256,15 +262,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; - void printScalarConstant(Constant *CPV, raw_ostream &O); + void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); - void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer); - void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer); + void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); + void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); void printOperandProper(const MachineOperand &MO); void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); - void emitDeclarations(Module &, raw_ostream &O); + void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); @@ -277,7 +283,6 @@ public: : AsmPrinter(TM, Streamer), nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) { CurrentBankselLabelInBasicBlock = ""; - VRidGlobal2LocalMap = NULL; reader = NULL; } diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp new file mode 100644 index 0000000..1077c46 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -0,0 +1,436 @@ +//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Convert generic global variables into either .global or .const access based +// on the variable's "constant" qualifier. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" + +#include "llvm/PassManager.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/IRBuilder.h" + +using namespace llvm; + +namespace llvm { +void initializeGenericToNVVMPass(PassRegistry &); +} + +namespace { +class GenericToNVVM : public ModulePass { +public: + static char ID; + + GenericToNVVM() : ModulePass(ID) {} + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + } + +private: + Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV, + IRBuilder<> &Builder); + Value *remapConstant(Module *M, Function *F, Constant *C, + IRBuilder<> &Builder); + Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F, + Constant *C, + IRBuilder<> &Builder); + Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C, + IRBuilder<> &Builder); + void remapNamedMDNode(Module *M, NamedMDNode *N); + MDNode *remapMDNode(Module *M, MDNode *N); + + typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy; + typedef ValueMap<Constant *, Value *> ConstantToValueMapTy; + GVMapTy GVMap; + ConstantToValueMapTy ConstantToValueMap; +}; +} + +char GenericToNVVM::ID = 0; + +ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); } + +INITIALIZE_PASS( + GenericToNVVM, "generic-to-nvvm", + "Ensure that the global variables are in the global address space", false, + false) + +bool GenericToNVVM::runOnModule(Module &M) { + // Create a clone of each global variable that has the default address space. + // The clone is created with the global address space specifier, and the pair + // of original global variable and its clone is placed in the GVMap for later + // use. + + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E;) { + GlobalVariable *GV = I++; + if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && + !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && + !GV->getName().startswith("llvm.")) { + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getType()->getElementType(), GV->isConstant(), + GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL, + "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL); + NewGV->copyAttributesFrom(GV); + GVMap[GV] = NewGV; + } + } + + // Return immediately, if every global variable has a specific address space + // specifier. + if (GVMap.empty()) { + return false; + } + + // Walk through the instructions in function defitinions, and replace any use + // of original global variables in GVMap with a use of the corresponding + // copies in GVMap. If necessary, promote constants to instructions. 
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (I->isDeclaration()) { + continue; + } + IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg()); + for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE; + ++BBI) { + for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE; + ++II) { + for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) { + Value *Operand = II->getOperand(i); + if (isa<Constant>(Operand)) { + II->setOperand( + i, remapConstant(&M, I, cast<Constant>(Operand), Builder)); + } + } + } + } + ConstantToValueMap.clear(); + } + + // Walk through the metadata section and update the debug information + // associated with the global variables in the default address space. + for (Module::named_metadata_iterator I = M.named_metadata_begin(), + E = M.named_metadata_end(); + I != E; I++) { + remapNamedMDNode(&M, I); + } + + // Walk through the global variable initializers, and replace any use of + // original global variables in GVMap with a use of the corresponding copies + // in GVMap. The copies need to be bitcast to the original global variable + // types, as we cannot use cvta in global variable initializers. + for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) { + GlobalVariable *GV = I->first; + GlobalVariable *NewGV = I->second; + ++I; + Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType()); + // At this point, the remaining uses of GV should be found only in global + // variable initializers, as other uses have been already been removed + // while walking through the instructions in function definitions. + for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end(); + UI != UE;) { + Use &U = (UI++).getUse(); + U.set(BitCastNewGV); + } + std::string Name = GV->getName(); + GV->removeDeadConstantUsers(); + GV->eraseFromParent(); + NewGV->setName(Name); + } + GVMap.clear(); + + return true; +} + +Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F, + GlobalVariable *GV, + IRBuilder<> &Builder) { + PointerType *GVType = GV->getType(); + Value *CVTA = NULL; + + // See if the address space conversion requires the operand to be bitcast + // to i8 addrspace(n)* first. + EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true); + if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) { + // A bitcast to i8 addrspace(n)* on the operand is needed. + LLVMContext &Context = M->getContext(); + unsigned int AddrSpace = GVType->getAddressSpace(); + Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace); + CVTA = Builder.CreateBitCast(GV, DestTy, "cvta"); + // Insert the address space conversion. + Type *ResultType = + PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC); + SmallVector<Type *, 2> ParamTypes; + ParamTypes.push_back(ResultType); + ParamTypes.push_back(DestTy); + Function *CVTAFunction = Intrinsic::getDeclaration( + M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes); + CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta"); + // Another bitcast from i8 * to <the element type of GVType> * is + // required. + DestTy = + PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC); + CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta"); + } else { + // A simple CVTA is enough. 
+ SmallVector<Type *, 2> ParamTypes; + ParamTypes.push_back(PointerType::get(GVType->getElementType(), + llvm::ADDRESS_SPACE_GENERIC)); + ParamTypes.push_back(GVType); + Function *CVTAFunction = Intrinsic::getDeclaration( + M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes); + CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta"); + } + + return CVTA; +} + +Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C, + IRBuilder<> &Builder) { + // If the constant C has been converted already in the given function F, just + // return the converted value. + ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C); + if (CTII != ConstantToValueMap.end()) { + return CTII->second; + } + + Value *NewValue = C; + if (isa<GlobalVariable>(C)) { + // If the constant C is a global variable and is found in GVMap, generate a + // set set of instructions that convert the clone of C with the global + // address space specifier to a generic pointer. + // The constant C cannot be used here, as it will be erased from the + // module eventually. And the clone of C with the global address space + // specifier cannot be used here either, as it will affect the types of + // other instructions in the function. Hence, this address space conversion + // is required. + GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C)); + if (I != GVMap.end()) { + NewValue = getOrInsertCVTA(M, F, I->second, Builder); + } + } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) || + isa<ConstantStruct>(C)) { + // If any element in the constant vector or aggregate C is or uses a global + // variable in GVMap, the constant C needs to be reconstructed, using a set + // of instructions. + NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder); + } else if (isa<ConstantExpr>(C)) { + // If any operand in the constant expression C is or uses a global variable + // in GVMap, the constant expression C needs to be reconstructed, using a + // set of instructions. + NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder); + } + + ConstantToValueMap[C] = NewValue; + return NewValue; +} + +Value *GenericToNVVM::remapConstantVectorOrConstantAggregate( + Module *M, Function *F, Constant *C, IRBuilder<> &Builder) { + bool OperandChanged = false; + SmallVector<Value *, 4> NewOperands; + unsigned NumOperands = C->getNumOperands(); + + // Check if any element is or uses a global variable in GVMap, and thus + // converted to another value. + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Operand = C->getOperand(i); + Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder); + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the elements has been modified, return C as it is. + if (!OperandChanged) { + return C; + } + + // If any of the elements has been modified, construct the equivalent + // vector or aggregate value with a set instructions and the converted + // elements. 
+ Value *NewValue = UndefValue::get(C->getType()); + if (isa<ConstantVector>(C)) { + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i); + NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx); + } + } else { + for (unsigned i = 0; i < NumOperands; ++i) { + NewValue = + Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i)); + } + } + + return NewValue; +} + +Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, + IRBuilder<> &Builder) { + bool OperandChanged = false; + SmallVector<Value *, 4> NewOperands; + unsigned NumOperands = C->getNumOperands(); + + // Check if any operand is or uses a global variable in GVMap, and thus + // converted to another value. + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Operand = C->getOperand(i); + Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder); + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return C as it is. + if (!OperandChanged) { + return C; + } + + // If any of the operands has been modified, construct the instruction with + // the converted operands. + unsigned Opcode = C->getOpcode(); + switch (Opcode) { + case Instruction::ICmp: + // CompareConstantExpr (icmp) + return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()), + NewOperands[0], NewOperands[1]); + case Instruction::FCmp: + // CompareConstantExpr (fcmp) + assert(false && "Address space conversion should have no effect " + "on float point CompareConstantExpr (fcmp)!"); + return C; + case Instruction::ExtractElement: + // ExtractElementConstantExpr + return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]); + case Instruction::InsertElement: + // InsertElementConstantExpr + return Builder.CreateInsertElement(NewOperands[0], NewOperands[1], + NewOperands[2]); + case Instruction::ShuffleVector: + // ShuffleVector + return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1], + NewOperands[2]); + case Instruction::ExtractValue: + // ExtractValueConstantExpr + return Builder.CreateExtractValue(NewOperands[0], C->getIndices()); + case Instruction::InsertValue: + // InsertValueConstantExpr + return Builder.CreateInsertValue(NewOperands[0], NewOperands[1], + C->getIndices()); + case Instruction::GetElementPtr: + // GetElementPtrConstantExpr + return cast<GEPOperator>(C)->isInBounds() + ? Builder.CreateGEP( + NewOperands[0], + makeArrayRef(&NewOperands[1], NumOperands - 1)) + : Builder.CreateInBoundsGEP( + NewOperands[0], + makeArrayRef(&NewOperands[1], NumOperands - 1)); + case Instruction::Select: + // SelectConstantExpr + return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]); + default: + // BinaryConstantExpr + if (Instruction::isBinaryOp(Opcode)) { + return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()), + NewOperands[0], NewOperands[1]); + } + // UnaryConstantExpr + if (Instruction::isCast(Opcode)) { + return Builder.CreateCast(Instruction::CastOps(C->getOpcode()), + NewOperands[0], C->getType()); + } + assert(false && "GenericToNVVM encountered an unsupported ConstantExpr"); + return C; + } +} + +void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) { + + bool OperandChanged = false; + SmallVector<MDNode *, 16> NewOperands; + unsigned NumOperands = N->getNumOperands(); + + // Check if any operand is or contains a global variable in GVMap, and thus + // converted to another value. 
+ for (unsigned i = 0; i < NumOperands; ++i) { + MDNode *Operand = N->getOperand(i); + MDNode *NewOperand = remapMDNode(M, Operand); + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return immediately. + if (!OperandChanged) { + return; + } + + // Replace the old operands with the new operands. + N->dropAllReferences(); + for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(), + E = NewOperands.end(); + I != E; ++I) { + N->addOperand(*I); + } +} + +MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) { + + bool OperandChanged = false; + SmallVector<Value *, 8> NewOperands; + unsigned NumOperands = N->getNumOperands(); + + // Check if any operand is or contains a global variable in GVMap, and thus + // converted to another value. + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Operand = N->getOperand(i); + Value *NewOperand = Operand; + if (Operand) { + if (isa<GlobalVariable>(Operand)) { + GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand)); + if (I != GVMap.end()) { + NewOperand = I->second; + if (++i < NumOperands) { + NewOperands.push_back(NewOperand); + // Address space of the global variable follows the global variable + // in the global variable debug info (see createGlobalVariable in + // lib/Analysis/DIBuilder.cpp). + NewOperand = + ConstantInt::get(Type::getInt32Ty(M->getContext()), + I->second->getType()->getAddressSpace()); + } + } + } else if (isa<MDNode>(Operand)) { + NewOperand = remapMDNode(M, cast<MDNode>(Operand)); + } + } + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return N as it is. + if (!OperandChanged) { + return N; + } + + // If any of the operands has been modified, create a new MDNode with the new + // operands. + return MDNode::get(M->getContext(), makeArrayRef(NewOperands)); +} diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0f4c8db..ac6dbb9 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -42,6 +42,11 @@ static cl::opt<int> UsePrecDivF32( " IEEE Compliant F32 div.rnd if avaiable."), cl::init(2)); +static cl::opt<bool> +UsePrecSqrtF32("nvptx-prec-sqrtf32", + cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), + cl::init(true)); + /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, @@ -74,6 +79,8 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, // Decide how to translate f32 div do_DIVF32_PREC = UsePrecDivF32; + // Decide how to translate f32 sqrt + do_SQRTF32_PREC = UsePrecSqrtF32; // sm less than sm_20 does not support div.rnd. Use div.full. 
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20()) do_DIVF32_PREC = 1; @@ -120,42 +127,26 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { const Value *Src = N->getSrcValue(); + if (!Src) - return NVPTX::PTXLdStInstCode::LOCAL; + return NVPTX::PTXLdStInstCode::GENERIC; if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { switch (PT->getAddressSpace()) { - case llvm::ADDRESS_SPACE_LOCAL: - return NVPTX::PTXLdStInstCode::LOCAL; - case llvm::ADDRESS_SPACE_GLOBAL: - return NVPTX::PTXLdStInstCode::GLOBAL; - case llvm::ADDRESS_SPACE_SHARED: - return NVPTX::PTXLdStInstCode::SHARED; - case llvm::ADDRESS_SPACE_CONST_NOT_GEN: - return NVPTX::PTXLdStInstCode::CONSTANT; - case llvm::ADDRESS_SPACE_GENERIC: - return NVPTX::PTXLdStInstCode::GENERIC; - case llvm::ADDRESS_SPACE_PARAM: - return NVPTX::PTXLdStInstCode::PARAM; - case llvm::ADDRESS_SPACE_CONST: - // If the arch supports generic address space, translate it to GLOBAL - // for correctness. - // If the arch does not support generic address space, then the arch - // does not really support ADDRESS_SPACE_CONST, translate it to - // to CONSTANT for better performance. - if (Subtarget.hasGenericLdSt()) - return NVPTX::PTXLdStInstCode::GLOBAL; - else - return NVPTX::PTXLdStInstCode::CONSTANT; - default: - break; + case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; + case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; + case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; + case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; + case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; + case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT; + default: break; } } - return NVPTX::PTXLdStInstCode::LOCAL; + return NVPTX::PTXLdStInstCode::GENERIC; } SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); LoadSDNode *LD = cast<LoadSDNode>(N); EVT LoadedVT = LD->getMemoryVT(); SDNode *NVPTXLD = NULL; @@ -198,7 +189,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { // type is integer // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float MVT ScalarVT = SimpleVT.getScalarType(); - unsigned fromTypeWidth = ScalarVT.getSizeInBits(); + // Read at least 8 bits (predicates are stored as 8-bit values) + unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits()); unsigned int fromType; if ((LD->getExtensionType() == ISD::SEXTLOAD)) fromType = NVPTX::PTXLdStInstCode::Signed; @@ -394,7 +386,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; unsigned Opcode; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDNode *LD; MemSDNode *MemSD = cast<MemSDNode>(N); EVT LoadedVT = MemSD->getMemoryVT(); @@ -423,7 +415,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { // type is integer // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float MVT ScalarVT = SimpleVT.getScalarType(); - unsigned FromTypeWidth = ScalarVT.getSizeInBits(); + // Read at least 8 bits (predicates are stored as 8-bit values) + unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits()); unsigned int FromType; // The last operand holds the original LoadSDNode::getExtensionType() value unsigned ExtensionType = cast<ConstantSDNode>( @@ -775,7 +768,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { SDValue Chain = N->getOperand(0); 
SDValue Op1 = N->getOperand(1); unsigned Opcode; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDNode *LD; EVT RetVT = N->getValueType(0); @@ -972,7 +965,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); StoreSDNode *ST = cast<StoreSDNode>(N); EVT StoreVT = ST->getMemoryVT(); SDNode *NVPTXST = NULL; @@ -1207,7 +1200,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; unsigned Opcode; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDNode *ST; EVT EltVT = Op1.getValueType(); MemSDNode *MemSD = cast<MemSDNode>(N); diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 70e8e46..ed16d44 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -41,6 +41,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { // Otherwise, use div.full int do_DIVF32_PREC; + // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ + // is true, then generate the corresponding FTZ version. + bool do_SQRTF32_PREC; + // If true, add .ftz to f32 instructions. // This is only meaningful for sm_20 and later, as the default // is not ftz. diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 6e01a5a..6cc850e 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -275,7 +275,7 @@ bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const { SDValue NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); @@ -435,7 +435,7 @@ std::string NVPTXTargetLowering::getPrototype( SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; + SDLoc dl = CLI.DL; SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; SmallVector<SDValue, 32> &OutVals = CLI.OutVals; SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; @@ -449,8 +449,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isABI = (nvptxSubtarget.getSmVersion() >= 20); SDValue tempChain = Chain; - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true)); + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(uniqueCallSite, true), + dl); SDValue InFlag = Chain.getValue(1); assert((Outs.size() == Args.size()) && @@ -795,7 +796,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true), DAG.getIntPtrConstant(uniqueCallSite + 1, true), - InFlag); + InFlag, dl); uniqueCallSite++; // set isTailCall to false for now, until we figure out how to express @@ -810,7 +811,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); SmallVector<SDValue, 8> Ops; unsigned NumOperands = Node->getNumOperands(); for (unsigned i = 0; i < NumOperands; ++i) { @@ -866,7 +867,7 @@ 
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); LoadSDNode *LD = cast<LoadSDNode>(Node); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); assert(LD->getExtensionType() == ISD::NON_EXTLOAD); assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); @@ -896,7 +897,7 @@ SDValue NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { SDNode *N = Op.getNode(); SDValue Val = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT ValVT = Val.getValueType(); if (ValVT.isVector()) { @@ -985,7 +986,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { // st i8, addr SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); StoreSDNode *ST = cast<StoreSDNode>(Node); SDValue Tmp1 = ST->getChain(); SDValue Tmp2 = ST->getBasePtr(); @@ -1046,7 +1047,7 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); const DataLayout *TD = getDataLayout(); @@ -1145,14 +1146,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( false, TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); if (p.getNode()) - DAG.AssignOrdering(p.getNode(), idx + 1); + p.getNode()->setIROrder(idx + 1); InVals.push_back(p); } else { // If no ABI, just move the param symbol SDValue Arg = getParamSymbol(DAG, idx, ObjectVT); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) - DAG.AssignOrdering(p.getNode(), idx + 1); + p.getNode()->setIROrder(idx + 1); InVals.push_back(p); } continue; @@ -1169,7 +1170,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) - DAG.AssignOrdering(p.getNode(), idx + 1); + p.getNode()->setIROrder(idx + 1); if (isKernel) InVals.push_back(p); else { @@ -1240,7 +1241,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue NVPTXTargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, + const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const { bool isABI = (nvptxSubtarget.getSmVersion() >= 20); @@ -1450,7 +1451,7 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) { EVT ResVT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); assert(ResVT.isVector() && "Vector load must have vector type"); @@ -1543,7 +1544,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) { SDValue Chain = N->getOperand(0); SDValue Intrin = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Get the intrinsic ID unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); diff --git 
a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 3cd49d3..d3ed63a 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -100,7 +100,7 @@ public: /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; - virtual EVT getSetCCResultType(EVT VT) const { + virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const { if (VT.isVector()) return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); return MVT::i1; @@ -112,7 +112,7 @@ public: virtual SDValue LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -125,7 +125,7 @@ public: virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, + const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const; virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 33a63c2..52be287 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -32,36 +32,36 @@ NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { - if (NVPTX::Int32RegsRegClass.contains(DestReg) && - NVPTX::Int32RegsRegClass.contains(SrcReg)) + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + + if (DestRC != SrcRC) + report_fatal_error("Attempted to created cross-class register copy"); + + if (DestRC == &NVPTX::Int32RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int8RegsRegClass.contains(DestReg) && - NVPTX::Int8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int1RegsRegClass.contains(DestReg) && - NVPTX::Int1RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int1RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Float32RegsRegClass.contains(DestReg) && - NVPTX::Float32RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Float32RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int16RegsRegClass.contains(DestReg) && - NVPTX::Int16RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int16RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int64RegsRegClass.contains(DestReg) && - NVPTX::Int64RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int8RegsRegClass) + BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) + 
.addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int64RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Float64RegsRegClass.contains(DestReg) && - NVPTX::Float64RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Float64RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + .addReg(SrcReg, getKillRegState(KillSrc)); else { - llvm_unreachable("Don't know how to copy a register"); + llvm_unreachable("Bad register copy"); } } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index f43abe2..da6dd39 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -75,6 +75,9 @@ def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">; def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">; def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">; +def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">; +def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">; + def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; def true : Predicate<"1">; diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 49e2568..24037ca 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -512,6 +512,16 @@ def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_sqrt_rp_d>; +// nvvm_sqrt intrinsic +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>; + // // Rsqrt // @@ -1510,38 +1520,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> { defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>; defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>; +defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>; defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>; defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>; defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>; - -def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>; -def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>; - - - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the def. 
-/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>; -def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>;*/ - - -def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>; -def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>; +defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>; // nvvm.ptr.gen.to.param diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp new file mode 100644 index 0000000..843ebed --- /dev/null +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -0,0 +1,225 @@ +//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified +// to remove unneeded functionality and to handle virtual registers. Most code +// here is a copy of PrologEpilogInserter.cpp. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class NVPTXPrologEpilogPass : public MachineFunctionPass { +public: + static char ID; + NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + +private: + void calculateFrameObjectOffsets(MachineFunction &Fn); +}; +} + +MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() { + return new NVPTXPrologEpilogPass(); +} + +char NVPTXPrologEpilogPass::ID = 0; + +bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); + bool Modified = false; + + calculateFrameObjectOffsets(MF); + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + MachineInstr *MI = I; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (!MI->getOperand(i).isFI()) + continue; + TRI.eliminateFrameIndex(MI, 0, i, NULL); + Modified = true; + } + } + } + + // Add function prolog/epilog + TFI.emitPrologue(MF); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + // If last instruction is a return instruction, add an epilogue + if (!I->empty() && I->back().isReturn()) + TFI.emitEpilogue(MF, *I); + } + + return Modified; +} + +/// AdjustStackOffset - Helper function used to 
adjust the stack frame offset. +static inline void +AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, + bool StackGrowsDown, int64_t &Offset, + unsigned &MaxAlign) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. + Offset = (Offset + Align - 1) / Align * Align; + + if (StackGrowsDown) { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset + } else { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, Offset); + Offset += MFI->getObjectSize(FrameIdx); + } +} + +void +NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo(); + + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Start at the beginning of the local area. + // The Offset is the distance from the stack top in the direction + // of stack growth -- so it's always nonnegative. + int LocalAreaOffset = TFI.getOffsetOfLocalArea(); + if (StackGrowsDown) + LocalAreaOffset = -LocalAreaOffset; + assert(LocalAreaOffset >= 0 + && "Local area offset should be in direction of stack growth"); + int64_t Offset = LocalAreaOffset; + + // If there are fixed sized objects that are preallocated in the local area, + // non-fixed objects can't be allocated right at the start of local area. + // We currently don't support filling in holes in between fixed sized + // objects, so we adjust 'Offset' to point to the end of last fixed sized + // preallocated object. + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int64_t FixedOff; + if (StackGrowsDown) { + // The maximum distance from the stack pointer is at lower address of + // the object -- which is given by offset. For down growing stack + // the offset is negative, so we negate the offset to get the distance. + FixedOff = -MFI->getObjectOffset(i); + } else { + // The maximum distance from the start pointer is at the upper + // address of the object. + FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); + } + if (FixedOff > Offset) Offset = FixedOff; + } + + // NOTE: We do not have a call stack + + unsigned MaxAlign = MFI->getMaxAlignment(); + + // No scavenger + + // FIXME: Once this is working, then enable flag will change to a target + // check for whether the frame is large enough to want to use virtual + // frame index registers. Functions which don't want/need this optimization + // will continue to use the existing code path. + if (MFI->getUseLocalStackAllocationBlock()) { + unsigned Align = MFI->getLocalFrameMaxAlign(); + + // Adjust to alignment boundary. + Offset = (Offset + Align - 1) / Align * Align; + + DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); + + // Resolve offsets for objects in the local block. 
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { + std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i); + int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + + // No stack protector + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + } + + // No scavenger + + if (!TFI.targetHandlesStackFrameRounding()) { + // If we have reserved argument space for call sites in the function + // immediately on entry to the current function, count it as part of the + // overall stack size. + if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI.getStackAlignment(); + else + StackAlign = TFI.getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + } + + // Update frame info to pretend that this is part of the stack... + int64_t StackSize = Offset - LocalAreaOffset; + MFI->setStackSize(StackSize); +} diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 2824653..bb039f8 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -57,9 +57,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; } if (RC == &NVPTX::Float64RegsRegClass) { - return "%fd"; + return "%fl"; } else if (RC == &NVPTX::Int64RegsRegClass) { - return "%rd"; + return "%rl"; } else if (RC == &NVPTX::Int32RegsRegClass) { return "%r"; } else if (RC == &NVPTX::Int16RegsRegClass) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 67ca6b5..72afe8d 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -49,6 +49,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry&); +void initializeGenericToNVVMPass(PassRegistry&); } extern "C" void LLVMInitializeNVPTXTarget() { @@ -62,6 +63,7 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. 
initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); } NVPTXTargetMachine::NVPTXTargetMachine( @@ -72,7 +74,9 @@ NVPTXTargetMachine::NVPTXTargetMachine( Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()), InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering( - *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {} + *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { + initAsmInfo(); +} void NVPTXTargetMachine32::anchor() {} @@ -90,7 +94,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64( CodeGenOpt::Level OL) : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -namespace llvm { +namespace { class NVPTXPassConfig : public TargetPassConfig { public: NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) @@ -100,16 +104,36 @@ public: return getTM<NVPTXTargetMachine>(); } + virtual void addIRPasses(); virtual bool addInstSelector(); virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + + virtual FunctionPass *createTargetRegisterAllocator(bool) LLVM_OVERRIDE; + virtual void addFastRegAlloc(FunctionPass *RegAllocPass); + virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass); }; -} +} // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM); return PassConfig; } +void NVPTXPassConfig::addIRPasses() { + // The following passes are known to not play well with virtual regs hanging + // around after register allocation (which in our case, is *all* registers). + // We explicitly disable them here. We do, however, need some functionality + // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the + // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 
+ disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineCopyPropagationID); + disablePass(&BranchFolderPassID); + + TargetPassConfig::addIRPasses(); + addPass(createGenericToNVVMPass()); +} + bool NVPTXPassConfig::addInstSelector() { addPass(createLowerAggrCopies()); addPass(createSplitBBatBarPass()); @@ -119,3 +143,21 @@ bool NVPTXPassConfig::addInstSelector() { } bool NVPTXPassConfig::addPreRegAlloc() { return false; } +bool NVPTXPassConfig::addPostRegAlloc() { + addPass(createNVPTXPrologEpilogPass()); + return false; +} + +FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { + return 0; // No reg alloc +} + +void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + assert(!RegAllocPass && "NVPTX uses no regalloc!"); + addPass(&StrongPHIEliminationID); +} + +void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + assert(!RegAllocPass && "NVPTX uses no regalloc!"); + addPass(&StrongPHIEliminationID); +} diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index 0ad62ce..3cc324b 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -14,6 +14,7 @@ // //===----------------------------------------------------------------------===// +#include "NVPTX.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -40,7 +41,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry &); } namespace { -class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass { +class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; @@ -48,9 +49,18 @@ private: public: static char ID; - NVVMReflect() : ModulePass(ID) { + NVVMReflect() : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); - ReflectFunction = 0; + } + + NVVMReflect(const StringMap<int> &Mapping) + : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); + I != E; ++I) { + VarMap[(*I).getKey()] = (*I).getValue(); + } } void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } @@ -60,6 +70,14 @@ public: }; } +ModulePass *llvm::createNVVMReflectPass() { + return new NVVMReflect(); +} + +ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) { + return new NVVMReflect(Mapping); +} + static cl::opt<bool> NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::desc("NVVM reflection, enabled by default")); |