Diffstat (limited to 'lib/Target/NVPTX')
-rw-r--r--  lib/Target/NVPTX/NVPTX.td                 |   7
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp      |  72
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.cpp   |  24
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.h     |   8
-rw-r--r--  lib/Target/NVPTX/NVPTXGenericToNVVM.cpp   |   2
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp    | 532
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.h      |   4
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.cpp    | 797
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.h      |  19
-rw-r--r--  lib/Target/NVPTX/NVPTXImageOptimizer.cpp  |   2
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.cpp       |   4
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.h         |   3
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td        | 348
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td       | 418
-rw-r--r--  lib/Target/NVPTX/NVPTXMCExpr.h            |   2
-rw-r--r--  lib/Target/NVPTX/NVPTXRegisterInfo.td     |   7
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.cpp       |  57
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.h         |  35
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp   |  66
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.h     |  42
-rw-r--r--  lib/Target/NVPTX/NVVMReflect.cpp          |  69
-rw-r--r--  lib/Target/NVPTX/cl_common_defines.h      |   6
22 files changed, 2031 insertions(+), 493 deletions(-)
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index d78b4e8..93fabf6 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
"Target SM 3.0">;
def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
"Target SM 3.5">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+ "Target SM 5.0">;
// PTX Versions
def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
"Use PTX version 3.0">;
def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
"Use PTX version 3.1">;
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+ "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+ "Use PTX version 4.0">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -52,6 +58,7 @@ def : Proc<"sm_20", [SM20]>;
def : Proc<"sm_21", [SM21]>;
def : Proc<"sm_30", [SM30]>;
def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_50", [SM50]>;
def NVPTXInstrInfo : InstrInfo {
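The NVPTX.td hunk above adds sm_50 as a target processor and ptx32/ptx40 as selectable PTX ISA versions. As a rough illustration of how the two feature families relate, here is a small standalone C++ sketch; it is not part of the patch, and the helper name and the exact SM-to-PTX pairings are assumptions inferred from the feature definitions above.

    // Hypothetical standalone helper (names and pairings are assumptions, not
    // patch code): choose a default PTX ISA version for a given SM version,
    // mirroring the SubtargetFeature definitions above.
    #include <cassert>

    static unsigned defaultPTXVersionFor(unsigned SmVersion) {
      if (SmVersion >= 50)
        return 40; // sm_50 is assumed to need at least PTX ISA 4.0
      if (SmVersion >= 30)
        return 31; // sm_30/sm_35 are assumed to be covered by PTX ISA 3.1
      return 30;   // sm_20/sm_21 fall back to PTX ISA 3.0
    }

    int main() {
      assert(defaultPTXVersionFor(50) == 40);
      assert(defaultPTXVersionFor(35) == 31);
      assert(defaultPTXVersionFor(20) == 30);
      return 0;
    }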
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 4ec575f..decf02a 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -734,23 +734,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
<< " func_retval0";
} else {
if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*TLI, Ty, vtparts);
- unsigned totalsz = 0;
- for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
- unsigned elems = 1;
- EVT elemtype = vtparts[i];
- if (vtparts[i].isVector()) {
- elems = vtparts[i].getVectorNumElements();
- elemtype = vtparts[i].getVectorElementType();
- }
- for (unsigned j = 0, je = elems; j != je; ++j) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- totalsz += sz / 8;
- }
- }
+ unsigned totalsz = TD->getTypeAllocSize(Ty);
unsigned retAlignment = 0;
if (!llvm::getAlign(*F, 0, retAlignment))
retAlignment = TD->getABITypeAlignment(Ty);
@@ -1321,6 +1305,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
// external global variable with init -> .visible
// external without init -> .extern
// appending -> not allowed, assert.
+// for any linkage other than
+// internal, private, linker_private,
+// linker_private_weak, linker_private_weak_def_auto,
+// we emit -> .weak.
void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
raw_ostream &O) {
@@ -1346,6 +1334,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
msg.append(V->getName().str());
msg.append("has unsupported appending linkage type");
llvm_unreachable(msg.c_str());
+ } else if (!V->hasInternalLinkage() &&
+ !V->hasPrivateLinkage()) {
+ O << ".weak ";
}
}
}
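The rules spelled out in the comment block above boil down to a small decision table: external with a definition gets .visible, external without one gets .extern, appending linkage is rejected, internal/private get no directive, and everything else gets .weak. The standalone C++ sketch below restates that table for illustration only; it uses a hypothetical Linkage enum rather than llvm::GlobalValue and is not code from the patch.

    // Illustration only (hypothetical enum, not llvm::GlobalValue): the PTX
    // linkage directive selection described in the comment above.
    #include <stdexcept>
    #include <string>

    enum class Linkage {
      ExternalDef,    // external with a definition/initializer
      ExternalDecl,   // external without a definition/initializer
      Internal,
      Private,
      Appending,
      Weak            // weak, linkonce, common, available_externally, ...
    };

    static std::string ptxLinkageDirective(Linkage L) {
      switch (L) {
      case Linkage::ExternalDef:  return ".visible ";
      case Linkage::ExternalDecl: return ".extern ";
      case Linkage::Internal:
      case Linkage::Private:      return "";   // no directive emitted
      case Linkage::Appending:
        throw std::runtime_error("appending linkage is not allowed in PTX");
      case Linkage::Weak:         return ".weak ";
      }
      return "";
    }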
@@ -1356,10 +1347,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// Skip meta data
if (GVar->hasSection()) {
- if (GVar->getSection() == "llvm.metadata")
+ if (GVar->getSection() == StringRef("llvm.metadata"))
return;
}
+ // Skip LLVM intrinsic global variables
+ if (GVar->getName().startswith("llvm.") ||
+ GVar->getName().startswith("nvvm."))
+ return;
+
const DataLayout *TD = TM.getDataLayout();
// GlobalVariables are always constant pointers themselves.
@@ -1371,6 +1367,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ".visible ";
else
O << ".extern ";
+ } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+ GVar->hasAvailableExternallyLinkage() ||
+ GVar->hasCommonLinkage()) {
+ O << ".weak ";
}
if (llvm::isTexture(*GVar)) {
@@ -1438,7 +1438,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << "linear";
break;
case 2:
- assert(0 && "Anisotropic filtering is not supported");
+ llvm_unreachable("Anisotropic filtering is not supported");
default:
O << "nearest";
break;
@@ -1480,6 +1480,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ".";
emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+ if (isManaged(*GVar)) {
+ O << " .attribute(.managed)";
+ }
+
if (GVar->getAlignment() == 0)
O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
else
@@ -1497,13 +1502,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// Ptx allows variable initialization only for constant and global state
// spaces.
- if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
- GVar->hasInitializer()) {
- const Constant *Initializer = GVar->getInitializer();
- if (!Initializer->isNullValue()) {
- O << " = ";
- printScalarConstant(Initializer, O);
+ if (GVar->hasInitializer()) {
+ if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+ const Constant *Initializer = GVar->getInitializer();
+ // 'undef' is treated as if no value was specified.
+ if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+ O << " = ";
+ printScalarConstant(Initializer, O);
+ }
+ } else {
+ // The frontend adds zero-initializer to variables that don't have an
+ // initial value, so skip warning for this case.
+ if (!GVar->getInitializer()->isNullValue()) {
+ std::string warnMsg = "initial value of '" + GVar->getName().str() +
+ "' is not allowed in addrspace(" +
+ llvm::utostr_32(PTy->getAddressSpace()) + ")";
+ report_fatal_error(warnMsg.c_str());
+ }
}
}
} else {
@@ -1562,7 +1578,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
break;
default:
- assert(0 && "type not supported yet");
+ llvm_unreachable("type not supported yet");
}
}
@@ -1682,7 +1698,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
O << "]";
break;
default:
- assert(0 && "type not supported yet");
+ llvm_unreachable("type not supported yet");
}
return;
}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 9030584..8b088412 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -26,6 +26,10 @@
using namespace llvm;
+NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
+ is64bit(STI.is64Bit()) {}
+
bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
@@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
// cvta.local %SP, %SPL;
if (is64bit) {
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
- MachineInstr *MI = BuildMI(
- MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
- NVPTX::VRFrame).addReg(LocalReg);
- BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
LocalReg).addImm(MF.getFunctionNumber());
} else {
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
- MachineInstr *MI = BuildMI(
- MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
- NVPTX::VRFrame).addReg(LocalReg);
- BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
LocalReg).addImm(MF.getFunctionNumber());
}
}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 2ae6d72..56fb673 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -17,16 +17,12 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
-class NVPTXTargetMachine;
-
+class NVPTXSubtarget;
class NVPTXFrameLowering : public TargetFrameLowering {
- NVPTXTargetMachine &tm;
bool is64bit;
public:
- explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm),
- is64bit(_is64bit) {}
+ explicit NVPTXFrameLowering(NVPTXSubtarget &STI);
bool hasFP(const MachineFunction &MF) const override;
void emitPrologue(MachineFunction &MF) const override;
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 023dd5e..faa9fdb 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -84,7 +84,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
GlobalVariable *GV = I++;
if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
!llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
- !GV->getName().startswith("llvm.")) {
+ !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
GlobalVariable *NewGV = new GlobalVariable(
M, GV->getType()->getElementType(), GV->isConstant(),
GV->getLinkage(),
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index cd30880..0dfbf10 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -24,11 +24,14 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-isel"
-static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::init(2));
+unsigned FMAContractLevel = 0;
+
+static cl::opt<unsigned, true>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::location(FMAContractLevel),
+ cl::init(2));
static cl::opt<int> UsePrecDivF32(
"nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
@@ -138,7 +141,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::LDGV4:
case NVPTXISD::LDUV2:
case NVPTXISD::LDUV4:
- ResNode = SelectLDGLDUVector(N);
+ ResNode = SelectLDGLDU(N);
break;
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -164,6 +167,9 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case ISD::INTRINSIC_WO_CHAIN:
ResNode = SelectIntrinsicNoChain(N);
break;
+ case ISD::INTRINSIC_W_CHAIN:
+ ResNode = SelectIntrinsicChain(N);
+ break;
case NVPTXISD::Tex1DFloatI32:
case NVPTXISD::Tex1DFloatFloat:
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -253,6 +259,12 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Suld3DV4I32Trap:
ResNode = SelectSurfaceIntrinsic(N);
break;
+ case ISD::AND:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Try to select BFE
+ ResNode = SelectBFE(N);
+ break;
case ISD::ADDRSPACECAST:
ResNode = SelectAddrSpaceCast(N);
break;
@@ -264,6 +276,21 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return NULL;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ return SelectLDGLDU(N);
+ }
+}
+
static unsigned int getCodeAddrSpace(MemSDNode *N,
const NVPTXSubtarget &Subtarget) {
const Value *Src = N->getMemOperand()->getValue();
@@ -981,22 +1008,101 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
return LD;
}
-SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
+SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Chain = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
+ SDValue Op1;
+ MemSDNode *Mem;
+ bool IsLDG = true;
+
+ // If this is an LDG intrinsic, the address is the third operand. If it's an
+ // LDG/LDU SD node (from custom vector handling), then it's the second operand.
+ if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ Op1 = N->getOperand(2);
+ Mem = cast<MemIntrinsicSDNode>(N);
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return NULL;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ IsLDG = true;
+ break;
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ IsLDG = false;
+ break;
+ }
+ } else {
+ Op1 = N->getOperand(1);
+ Mem = cast<MemSDNode>(N);
+ }
+
unsigned Opcode;
SDLoc DL(N);
SDNode *LD;
- MemSDNode *Mem = cast<MemSDNode>(N);
SDValue Base, Offset, Addr;
- EVT EltVT = Mem->getMemoryVT().getVectorElementType();
+ EVT EltVT = Mem->getMemoryVT();
+ if (EltVT.isVector()) {
+ EltVT = EltVT.getVectorElementType();
+ }
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1092,6 +1198,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1181,6 +1336,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1276,6 +1480,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1365,6 +1618,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1457,7 +1759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ MemRefs0[0] = Mem->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
return LD;
@@ -2959,6 +3261,214 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
return Ret;
}
+/// SelectBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction
+SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Len;
+ SDValue Start;
+ SDValue Val;
+ bool IsSigned = false;
+
+ if (N->getOpcode() == ISD::AND) {
+ // Canonicalize the operands
+ // We want 'and %val, %mask'
+ if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+ std::swap(LHS, RHS);
+ }
+
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+ if (!Mask) {
+ // We need a constant mask on the RHS of the AND
+ return NULL;
+ }
+
+ // Extract the mask bits
+ uint64_t MaskVal = Mask->getZExtValue();
+ if (!isMask_64(MaskVal)) {
+ // We *could* handle shifted masks here, but doing so would require an
+ // 'and' operation to fix up the low-order bits so we would trade
+ // shr+and for bfe+and, which has the same throughput
+ return NULL;
+ }
+
+ // How many bits are in our mask?
+ uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+ if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+ // We have a 'srl/and' pair, extract the effective start bit and length
+ Val = LHS.getNode()->getOperand(0);
+ Start = LHS.getNode()->getOperand(1);
+ ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+ if (StartConst) {
+ uint64_t StartVal = StartConst->getZExtValue();
+ // How many "good" bits do we have left? "good" is defined here as bits
+ // that exist in the original value, not shifted in.
+ uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+ if (NumBits > GoodBits) {
+ // Do not handle the case where bits have been shifted in. In theory
+ // we could handle this, but the cost is likely higher than just
+ // emitting the srl/and pair.
+ return NULL;
+ }
+ Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+ } else {
+ // Do not handle the case where the shift amount (can be zero if no srl
+ // was found) is not constant. We could handle this case, but it would
+ // require run-time logic that would be more expensive than just
+ // emitting the srl/and pair.
+ return NULL;
+ }
+ } else {
+ // Do not handle the case where the LHS of the and is not a shift. While
+ // it would be trivial to handle this case, it would just transform
+ // 'and' -> 'bfe', but 'and' has higher-throughput.
+ return NULL;
+ }
+ } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+ if (LHS->getOpcode() == ISD::AND) {
+ ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShiftCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+
+ uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+ SDValue AndLHS = LHS->getOperand(0);
+ SDValue AndRHS = LHS->getOperand(1);
+
+ // Canonicalize the AND to have the mask on the RHS
+ if (isa<ConstantSDNode>(AndLHS)) {
+ std::swap(AndLHS, AndRHS);
+ }
+
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+ if (!MaskCnst) {
+ // Mask must be constant
+ return NULL;
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ uint64_t NumZeros;
+ uint64_t NumBits;
+ if (isMask_64(MaskVal)) {
+ NumZeros = 0;
+ // The number of bits in the result bitfield will be the number of
+ // trailing ones (the AND) minus the number of bits we shift off
+ NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+ } else if (isShiftedMask_64(MaskVal)) {
+ NumZeros = countTrailingZeros(MaskVal);
+ unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+ // The number of bits in the result bitfield will be the number of
+ // trailing zeros plus the number of set bits in the mask minus the
+ // number of bits we shift off
+ NumBits = NumZeros + NumOnes - ShiftAmt;
+ } else {
+ // This is not a mask we can handle
+ return NULL;
+ }
+
+ if (ShiftAmt < NumZeros) {
+ // Handling this case would require extra logic that would make this
+ // transformation non-profitable
+ return NULL;
+ }
+
+ Val = AndLHS;
+ Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+ } else if (LHS->getOpcode() == ISD::SHL) {
+ // Here, we have a pattern like:
+ //
+ // (sra (shl val, NN), MM)
+ // or
+ // (srl (shl val, NN), MM)
+ //
+ // If MM >= NN, we can efficiently optimize this with bfe
+ Val = LHS->getOperand(0);
+
+ SDValue ShlRHS = LHS->getOperand(1);
+ ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+ if (!ShlCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+ SDValue ShrRHS = RHS;
+ ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+ if (!ShrCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+ // To avoid extra codegen and be profitable, we need Outer >= Inner
+ if (OuterShiftAmt < InnerShiftAmt) {
+ return NULL;
+ }
+
+ // If the outer shift is more than the type size, we have no bitfield to
+ // extract (since we also check that the inner shift is <= the outer shift
+ // then this also implies that the inner shift is < the type size)
+ if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+ return NULL;
+ }
+
+ Start =
+ CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+ Len =
+ CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+ OuterShiftAmt, MVT::i32);
+
+ if (N->getOpcode() == ISD::SRA) {
+ // If we have an arithmetic right shift, we need to use the signed bfe
+ // variant
+ IsSigned = true;
+ }
+ } else {
+ // No can do...
+ return NULL;
+ }
+ } else {
+ // No can do...
+ return NULL;
+ }
+
+
+ unsigned Opc;
+ // For the BFE operations we form here from "and" and "srl", always use the
+ // unsigned variants.
+ if (Val.getValueType() == MVT::i32) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S32rii;
+ } else {
+ Opc = NVPTX::BFE_U32rii;
+ }
+ } else if (Val.getValueType() == MVT::i64) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S64rii;
+ } else {
+ Opc = NVPTX::BFE_U64rii;
+ }
+ } else {
+ // We cannot handle this type
+ return NULL;
+ }
+
+ SDValue Ops[] = {
+ Val, Start, Len
+ };
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+ return Ret;
+}
+
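For reference, the arithmetic that SelectBFE relies on in the unsigned case is simply "shift right by start, then mask to len bits". The sketch below is plain C++, not LLVM code; it checks that identity on the two shapes matched above (an srl+and pair and a bare and-with-mask), and deliberately does not model the clamping behavior of the real PTX bfe instruction for out-of-range start/len values.

    // Plain C++ emulation of the unsigned 32-bit bit-field extract that the
    // 'and'/'srl' patterns above are folded into. Illustration only; PTX bfe's
    // clamping for out-of-range start/len is not modeled here.
    #include <cassert>
    #include <cstdint>

    static uint32_t bfe_u32(uint32_t val, uint32_t start, uint32_t len) {
      assert(len > 0 && len < 32 && start + len <= 32 && "edge cases not modeled");
      return (val >> start) & ((1u << len) - 1u);
    }

    int main() {
      uint32_t x = 0xDEADBEEFu;
      // (x >> 8) & 0xFF is the kind of srl+and pair SelectBFE rewrites.
      assert(bfe_u32(x, 8, 8) == ((x >> 8) & 0xFFu));
      // (x & 0xFFF) is an and-with-mask with start == 0.
      assert(bfe_u32(x, 0, 12) == (x & 0xFFFu));
      return 0;
    }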
// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 11f92e7..c44ccb2 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -59,10 +59,11 @@ private:
SDNode *Select(SDNode *N) override;
SDNode *SelectIntrinsicNoChain(SDNode *N);
+ SDNode *SelectIntrinsicChain(SDNode *N);
SDNode *SelectTexSurfHandle(SDNode *N);
SDNode *SelectLoad(SDNode *N);
SDNode *SelectLoadVector(SDNode *N);
- SDNode *SelectLDGLDUVector(SDNode *N);
+ SDNode *SelectLDGLDU(SDNode *N);
SDNode *SelectStore(SDNode *N);
SDNode *SelectStoreVector(SDNode *N);
SDNode *SelectLoadParam(SDNode *N);
@@ -71,6 +72,7 @@ private:
SDNode *SelectAddrSpaceCast(SDNode *N);
SDNode *SelectTextureIntrinsic(SDNode *N);
SDNode *SelectSurfaceIntrinsic(SDNode *N);
+ SDNode *SelectBFE(SDNode *N);
inline SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b0943be..cb452ff 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>
@@ -111,6 +112,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Jump is Expensive. Don't create extra control flow for 'and', 'or'
// condition branches.
@@ -130,7 +132,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
// Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -146,6 +154,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
+
if (nvptxSubtarget.hasROT64()) {
setOperationAction(ISD::ROTL, MVT::i64, Legal);
setOperationAction(ISD::ROTR, MVT::i64, Legal);
@@ -237,6 +252,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ // We have some custom DAG combine patterns for these nodes
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
+
// Now deduce the information based on the above mentioned
// actions
computeRegisterProperties();
@@ -328,6 +350,16 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::StoreV2";
case NVPTXISD::StoreV4:
return "NVPTXISD::StoreV4";
+ case NVPTXISD::FUN_SHFL_CLAMP:
+ return "NVPTXISD::FUN_SHFL_CLAMP";
+ case NVPTXISD::FUN_SHFR_CLAMP:
+ return "NVPTXISD::FUN_SHFR_CLAMP";
+ case NVPTXISD::IMAD:
+ return "NVPTXISD::IMAD";
+ case NVPTXISD::MUL_WIDE_SIGNED:
+ return "NVPTXISD::MUL_WIDE_SIGNED";
+ case NVPTXISD::MUL_WIDE_UNSIGNED:
+ return "NVPTXISD::MUL_WIDE_UNSIGNED";
case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32";
case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -441,8 +473,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const {
- return VT.getScalarType() == MVT::i1;
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+ return TypeSplitVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
}
SDValue
@@ -487,26 +523,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << getPointerTy().getSizeInBits() << " _";
} else {
- if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) {
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, retTy, vtparts);
- unsigned totalsz = 0;
- for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
- unsigned elems = 1;
- EVT elemtype = vtparts[i];
- if (vtparts[i].isVector()) {
- elems = vtparts[i].getVectorNumElements();
- elemtype = vtparts[i].getVectorElementType();
- }
- // TODO: no need to loop
- for (unsigned j = 0, je = elems; j != je; ++j) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- totalsz += sz / 8;
- }
- }
- O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
+ if((retTy->getTypeID() == Type::StructTyID) ||
+ isa<VectorType>(retTy)) {
+ O << ".param .align "
+ << retAlignment
+ << " .b8 _["
+ << getDataLayout()->getTypeAllocSize(retTy) << "]";
} else {
assert(false && "Unknown return type");
}
@@ -675,7 +697,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Ty->isAggregateType()) {
// aggregate
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, Ty, vtparts);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
// declare .param .align <align> .b8 .param<n>[<size>];
@@ -687,34 +710,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
- unsigned curOffset = 0;
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- unsigned elems = 1;
EVT elemtype = vtparts[j];
- if (vtparts[j].isVector()) {
- elems = vtparts[j].getVectorNumElements();
- elemtype = vtparts[j].getVectorElementType();
- }
- for (unsigned k = 0, ke = elems; k != ke; ++k) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue StVal = OutVals[OIdx];
- if (elemtype.getSizeInBits() < 16) {
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, MVT::i32),
- DAG.getConstant(curOffset, MVT::i32),
- StVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- elemtype, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += sz / 8;
- ++OIdx;
+ unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+ if (elemtype.isInteger() && (sz < 8))
+ sz = 8;
+ SDValue StVal = OutVals[OIdx];
+ if (elemtype.getSizeInBits() < 16) {
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
}
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(Offsets[j], MVT::i32),
+ StVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ elemtype, MachinePointerInfo(),
+ ArgAlign);
+ InFlag = Chain.getValue(1);
+ ++OIdx;
}
if (vtparts.size() > 0)
--OIdx;
@@ -899,13 +914,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
// struct or vector
SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
- ComputeValueVTs(*this, PTy->getElementType(), vtparts);
+ ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
// so we don't need to worry about natural alignment or not.
// See TargetLowering::LowerCallTo().
@@ -917,38 +934,28 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
- unsigned curOffset = 0;
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- unsigned elems = 1;
EVT elemtype = vtparts[j];
- if (vtparts[j].isVector()) {
- elems = vtparts[j].getVectorNumElements();
- elemtype = vtparts[j].getVectorElementType();
+ int curOffset = Offsets[j];
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ SDValue srcAddr =
+ DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
+ DAG.getConstant(curOffset, getPointerTy()));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), false, false, false,
+ PartAlign);
+ if (elemtype.getSizeInBits() < 16) {
+ theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
}
- for (unsigned k = 0, ke = elems; k != ke; ++k) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
- DAG.getConstant(curOffset, getPointerTy()));
- SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), false, false, false,
- 0);
- if (elemtype.getSizeInBits() < 16) {
- theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
- DAG.getConstant(curOffset, MVT::i32), theVal,
- InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
- CopyParamOps, elemtype,
- MachinePointerInfo());
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(curOffset, MVT::i32), theVal,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, elemtype,
+ MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += sz / 8;
- }
+ InFlag = Chain.getValue(1);
}
++paramCount;
}
@@ -1057,7 +1064,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
- unsigned resoffset = 0;
if (retTy && retTy->isVectorTy()) {
EVT ObjectVT = getValueType(retTy);
unsigned NumElts = ObjectVT.getVectorNumElements();
@@ -1066,14 +1072,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ObjectVT) == NumElts &&
"Vector was not scalarized");
unsigned sz = EltVT.getSizeInBits();
- bool needTruncate = sz < 16 ? true : false;
+ bool needTruncate = sz < 8 ? true : false;
if (NumElts == 1) {
// Just a simple load
SmallVector<EVT, 4> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
// trunc i16 to i1
LoadRetVTs.push_back(MVT::i16);
} else
@@ -1097,9 +1104,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (NumElts == 2) {
// LoadV2
SmallVector<EVT, 4> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
// trunc i16 to i1
LoadRetVTs.push_back(MVT::i16);
LoadRetVTs.push_back(MVT::i16);
@@ -1142,9 +1150,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
for (unsigned i = 0; i < NumElts; i += VecSize) {
SmallVector<EVT, 8> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
// trunc i16 to i1
for (unsigned j = 0; j < VecSize; ++j)
LoadRetVTs.push_back(MVT::i16);
@@ -1183,10 +1192,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
} else {
SmallVector<EVT, 16> VTs;
- ComputePTXValueVTs(*this, retTy, VTs);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
unsigned sz = VTs[i].getSizeInBits();
+ unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
bool needTruncate = sz < 8 ? true : false;
if (VTs[i].isInteger() && (sz < 8))
sz = 8;
@@ -1212,19 +1224,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 4> LoadRetOps;
LoadRetOps.push_back(Chain);
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
- LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
LoadRetOps.push_back(InFlag);
SDValue retval = DAG.getMemIntrinsicNode(
NVPTXISD::LoadParam, dl,
DAG.getVTList(LoadRetVTs), LoadRetOps,
- TheLoadType, MachinePointerInfo());
+ TheLoadType, MachinePointerInfo(), AlignI);
Chain = retval.getValue(1);
InFlag = retval.getValue(2);
SDValue Ret0 = retval.getValue(0);
if (needTruncate)
Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
InVals.push_back(Ret0);
- resoffset += sz / 8;
}
}
}
@@ -1262,6 +1273,127 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
}
+/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // dHi = aHi >> Amt
+ // dLo = shf.r.clamp aLo, aHi, Amt
+
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // - if (Amt>=size) then
+ // dLo = aHi >> (Amt-size)
+ // dHi = aHi >> Amt (this is either all 0 or all 1)
+ // else
+ // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+ // dHi = aHi >> Amt
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // dHi = shf.l.clamp aLo, aHi, Amt
+ // dLo = aLo << Amt
+
+ SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // - if (Amt>=size) then
+ // dLo = aLo << Amt (all 0)
+ // dHi = aLo << (Amt-size)
+ // else
+ // dLo = aLo << Amt
+ // dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
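Both lowering routines above implement the pseudo-code in their comments: on sm_35+ the 32-bit path maps the cross-half term onto a single shf.{l,r}.clamp funnel shift, while the generic path builds it from two shifts, an or, and a select on Amt >= size. The plain C++ sketch below (not from the patch) mirrors the generic expansion for a 64-bit value split into 32-bit halves and checks it against native 64-bit shifts; unlike the DAG nodes, C++ shifts by 32 or more are undefined, so the Amt == 0 and Amt >= 32 cases are handled explicitly.

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    struct Parts { uint32_t Lo, Hi; };   // a 64-bit value as two 32-bit halves

    static Parts shlParts(Parts A, unsigned Amt) {
      if (Amt == 0)  return A;
      if (Amt >= 32) return { 0u, A.Lo << (Amt - 32) };              // dLo is all 0
      return { A.Lo << Amt, (A.Hi << Amt) | (A.Lo >> (32 - Amt)) };
    }

    static Parts srlParts(Parts A, unsigned Amt) {
      if (Amt == 0)  return A;
      if (Amt >= 32) return { A.Hi >> (Amt - 32), 0u };              // dHi is all 0
      return { (A.Lo >> Amt) | (A.Hi << (32 - Amt)), A.Hi >> Amt };
    }

    int main() {
      const uint64_t V = 0x0123456789ABCDEFull;
      const Parts P = { static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32) };
      for (unsigned Amt : { 0u, 1u, 13u, 31u, 33u, 63u }) {
        Parts L = shlParts(P, Amt);
        Parts R = srlParts(P, Amt);
        assert(((static_cast<uint64_t>(L.Hi) << 32) | L.Lo) == (V << Amt));
        assert(((static_cast<uint64_t>(R.Hi) << 32) | R.Lo) == (V >> Amt));
      }
      return 0;
    }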
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -1282,6 +1414,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTORE(Op, DAG);
case ISD::LOAD:
return LowerLOAD(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ return LowerShiftRightParts(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
@@ -1495,7 +1632,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
const Function *F = MF.getFunction();
const AttributeSet &PAL = F->getAttributes();
- const TargetLowering *TLI = nvTM->getTargetLowering();
+ const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
SDValue Root = DAG.getRoot();
std::vector<SDValue> OutChains;
@@ -1549,8 +1686,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
assert(vtparts.size() > 0 && "empty aggregate type not expected");
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
++parti) {
- EVT partVT = vtparts[parti];
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT));
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
++InsIdx;
}
if (vtparts.size() > 0)
@@ -1866,7 +2002,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Offset = 0;
EVT VecVT =
- EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize);
+ EVT::getVectorVT(F->getContext(), EltVT, VecSize);
unsigned PerStoreOffset =
TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
@@ -1925,12 +2061,10 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
} else {
SmallVector<EVT, 16> ValVTs;
- // const_cast is necessary since we are still using an LLVM version from
- // before the type system re-write.
- ComputePTXValueVTs(*this, RetTy, ValVTs);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
- unsigned SizeSoFar = 0;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
SDValue theVal = OutVals[i];
EVT TheValType = theVal.getValueType();
@@ -1954,16 +2088,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
else if (TmpVal.getValueType().getSizeInBits() < 16)
TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
- SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
+ SDValue Ops[] = {
+ Chain,
+ DAG.getConstant(Offsets[i], MVT::i32),
+ TmpVal };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
DAG.getVTList(MVT::Other), Ops,
TheStoreType,
MachinePointerInfo());
- if(TheValType.isVector())
- SizeSoFar +=
- TheStoreType.getVectorElementType().getStoreSizeInBits() / 8;
- else
- SizeSoFar += TheStoreType.getStoreSizeInBits()/8;
}
}
}
@@ -2220,22 +2352,62 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
+ case Intrinsic::nvvm_ldu_global_p: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
Info.memVT = getValueType(I.getType());
- else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ Info.memVT = getPointerTy();
+ else
Info.memVT = getValueType(I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
+ return true;
+ }
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p: {
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+ Info.memVT = getValueType(I.getType());
+ else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
+ Info.memVT = getPointerTy();
else
- Info.memVT = MVT::f32;
+ Info.memVT = getValueType(I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.vol = 0;
Info.readMem = true;
Info.writeMem = false;
- Info.align = 0;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
return true;
+ }
case Intrinsic::nvvm_tex_1d_v4f32_i32:
case Intrinsic::nvvm_tex_1d_v4f32_f32:
@@ -2427,6 +2599,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
switch (Constraint[0]) {
default:
break;
+ case 'b':
case 'r':
case 'h':
case 'c':
@@ -2446,6 +2619,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ case 'b':
+ return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
case 'c':
return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
case 'h':
@@ -2469,6 +2644,406 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
return 4;
}
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+extern unsigned FMAContractLevel;
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1. This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SelectionDAG &DAG = DCI.DAG;
+ // Skip non-integer, non-scalar case
+ EVT VT=N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ // fold (add (mul a, b), c) -> (mad a, b, c)
+ //
+ if (N0.getOpcode() == ISD::MUL) {
+ assert (VT.isInteger());
+ // For integer:
+ // Since integer multiply-add costs the same as integer multiply
+ // but is more costly than integer add, do the fusion only when
+ // the mul is only used in the add.
+ if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
+ !N0.getNode()->hasOneUse())
+ return SDValue();
+
+ // Do the folding
+ return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ else if (N0.getOpcode() == ISD::FMUL) {
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ if (FMAContractLevel == 0)
+ return SDValue();
+
+ // For floating point:
+ // Do the fusion only when the mul has less than 5 uses and all
+ // are add.
+ // The heuristic is that if a use is not an add, then that use
+ // cannot be fused into fma, therefore mul is still needed anyway.
+ // If there are more than 4 uses, even if they are all add, fusing
+ // them will increase register pressure.
+ //
+ int numUses = 0;
+ int nonAddCount = 0;
+ for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+ UE = N0.getNode()->use_end();
+ UI != UE; ++UI) {
+ numUses++;
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::FADD)
+ ++nonAddCount;
+ }
+ if (numUses >= 5)
+ return SDValue();
+ if (nonAddCount) {
+ int orderNo = N->getIROrder();
+ int orderNo2 = N0.getNode()->getIROrder();
+ // Simple heuristic here for considering potential register
+ // pressure: the difference is used to measure the distance
+ // between def and use; the longer the distance, the more likely
+ // it causes register pressure.
+ if (orderNo - orderNo2 < 500)
+ return SDValue();
+
+ // Now, check if at least one of the FMUL's operands is live beyond the node N,
+ // which guarantees that the FMA will not increase register pressure at node N.
+ bool opIsLive = false;
+ const SDNode *left = N0.getOperand(0).getNode();
+ const SDNode *right = N0.getOperand(1).getNode();
+
+ if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
+ opIsLive = true;
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ }
+
+ return SDValue();
+}
+
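The combine above folds (add (mul a, b), c) into a single IMAD node for i32 and (fadd (fmul a, b), c) into FMA when -nvptx-fma-level permits it. The short program below is an illustration only, not LLVM code: it shows the two identities, and the floating-point case is a reminder that the fused form rounds once, so it need not be bit-identical to the separate mul+add, which is exactly why the contraction is gated by a flag.

    #include <cmath>
    #include <cstdio>

    int main() {
      // Integer case: a * b + c is what the IMAD node expresses for i32.
      int a = 7, b = 9, c = 5;
      std::printf("a*b+c = %d\n", a * b + c);

      // Floating-point case: std::fma rounds once, mul-then-add rounds twice,
      // so the two results can differ in the last bit.
      float x = 0.1f, y = 0.2f, z = -0.02f;
      std::printf("mul+add = %.9g  fma = %.9g\n", x * y + z, std::fma(x, y, z));
      return 0;
    }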
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // First try with the default operand order.
+ SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
+ OptLevel);
+ if (Result.getNode())
+ return Result;
+
+ // If that didn't work, try again with the operands commuted.
+ return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // The type legalizer turns a vector load of i8 values into a zextload to i16
+ // registers, optionally ANY_EXTENDs it (if target type is integer),
+ // and ANDs off the high 8 bits. Since we turn this load into a
+ // target-specific DAG node, the DAG combiner fails to eliminate these AND
+ // nodes. Do that here.
+ SDValue Val = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ if (isa<ConstantSDNode>(Val)) {
+ std::swap(Val, Mask);
+ }
+
+ SDValue AExt;
+ // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ AExt = Val;
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+ Val->getOpcode() == NVPTXISD::LoadV4) {
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+ if (!MaskCnst) {
+ // Not an AND with a constant
+ return SDValue();
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ if (MaskVal != 0xff) {
+ // Not an AND that chops off top 8 bits
+ return SDValue();
+ }
+
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+ if (!Mem) {
+ // Not a MemSDNode?!?
+ return SDValue();
+ }
+
+ EVT MemVT = Mem->getMemoryVT();
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+ // We only handle the i8 case
+ return SDValue();
+ }
+
+ unsigned ExtType =
+ cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+ getZExtValue();
+ if (ExtType == ISD::SEXTLOAD) {
+ // If for some reason the load is a sextload, the and is needed to zero
+ // out the high 8 bits
+ return SDValue();
+ }
+
+ bool AddTo = false;
+ if (AExt.getNode() != 0) {
+ // Re-insert the ext as a zext.
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+ AExt.getValueType(), Val);
+ AddTo = true;
+ }
+
+ // If we get here, the AND is unnecessary. Just replace it with the load
+ DCI.CombineTo(N, Val, AddTo);
+ }
+
+ return SDValue();
+}
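A minimal standalone sketch (not part of the patch) of why the AND is redundant here: a zero-extending load of an i8 already leaves the upper bits of the wider register cleared, so masking with 0xff afterwards is a no-op.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t byte = 0xAB;
  // Model of a zextload i8 -> i16: the high 8 bits are already zero.
  uint16_t zext = static_cast<uint16_t>(byte);
  // Masking with 0xff therefore changes nothing, which is why the AND
  // node can simply be replaced by the load result.
  assert((zext & 0xff) == zext);
  return 0;
}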
+
+enum OperandSignedness {
+ Signed = 0,
+ Unsigned,
+ Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+ unsigned OptSize,
+ OperandSignedness &S) {
+ S = Unknown;
+
+ if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+ Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Signed;
+ return true;
+ }
+ } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Unsigned;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+ unsigned OptSize,
+ bool &IsSigned) {
+
+ OperandSignedness LHSSign;
+
+ // The LHS operand must be a demotable op
+ if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+ return false;
+
+ // We should have been able to determine the signedness from the LHS
+ if (LHSSign == Unknown)
+ return false;
+
+ IsSigned = (LHSSign == Signed);
+
+ // The RHS can be a demotable op or a constant
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+ APInt Val = CI->getAPIntValue();
+ if (LHSSign == Unsigned) {
+ if (Val.isIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ } else {
+ if (Val.isSignedIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ }
+ } else {
+ OperandSignedness RHSSign;
+ if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+ return false;
+
+ if (LHSSign != RHSSign)
+ return false;
+
+ return true;
+ }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT MulType = N->getValueType(0);
+ if (MulType != MVT::i32 && MulType != MVT::i64) {
+ return SDValue();
+ }
+
+ unsigned OptSize = MulType.getSizeInBits() >> 1;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Canonicalize the multiply so the constant (if any) is on the right
+ if (N->getOpcode() == ISD::MUL) {
+ if (isa<ConstantSDNode>(LHS)) {
+ std::swap(LHS, RHS);
+ }
+ }
+
+ // If we have a SHL, determine the actual multiply amount
+ if (N->getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShlRHS) {
+ return SDValue();
+ }
+
+ APInt ShiftAmt = ShlRHS->getAPIntValue();
+ unsigned BitWidth = MulType.getSizeInBits();
+ if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+ APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, MulType);
+ } else {
+ return SDValue();
+ }
+ }
+
+ bool Signed;
+ // Verify that our operands are demotable
+ if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+ return SDValue();
+ }
+
+ EVT DemotedVT;
+ if (MulType == MVT::i32) {
+ DemotedVT = MVT::i16;
+ } else {
+ DemotedVT = MVT::i32;
+ }
+
+ // Truncate the operands to the correct size. Note that these are just for
+ // type consistency and will (likely) be eliminated in later phases.
+ SDValue TruncLHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
+ SDValue TruncRHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
+
+ unsigned Opc;
+ if (Signed) {
+ Opc = NVPTXISD::MUL_WIDE_SIGNED;
+ } else {
+ Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+ }
+
+ return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
+}
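A minimal standalone sketch (not part of the patch) of the property the demotion check relies on: when both operands of an i32 multiply are sign extensions of i16 values, the product can never overflow 32 bits, so a 16x16 -> 32 widening multiply (what mul.wide.s16 computes) yields the same result.

#include <cassert>
#include <cstdint>

int main() {
  int16_t a = -1234, b = 567;
  // The i32 multiply of the two sign-extended i16 operands...
  int32_t narrow = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  // ...cannot overflow (|a*b| <= 2^30), so it equals the exact widening
  // product that a single mul.wide.s16 instruction produces.
  int64_t exact = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  assert(static_cast<int64_t>(narrow) == exact);
  return 0;
}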
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
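A minimal standalone sketch (not part of the patch) of why the SHL path feeds the same matcher: a left shift by a constant is just a multiply by a power of two, so a shl of a sign- or zero-extended operand can be rewritten into the same mul.wide form.

#include <cassert>
#include <cstdint>

int main() {
  int32_t x = 0x1234;
  unsigned shift = 5;
  // shl by a constant is a multiply by 2^shift, which is why the SHL
  // combine converts the shift amount into a constant multiplier first.
  assert((x << shift) == x * (1 << shift));
  return 0;
}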
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // FIXME: Get this from the DAG somehow
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::FADD:
+ return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ }
+ return SDValue();
+}
+
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 7bad8a2..7b4026d 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -16,7 +16,6 @@
#define NVPTXISELLOWERING_H
#include "NVPTX.h"
-#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
@@ -50,6 +49,11 @@ enum NodeType {
CallSeqBegin,
CallSeqEnd,
CallPrototype,
+ FUN_SHFL_CLAMP,
+ FUN_SHFR_CLAMP,
+ MUL_WIDE_SIGNED,
+ MUL_WIDE_UNSIGNED,
+ IMAD,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -167,6 +171,8 @@ enum NodeType {
};
}
+class NVPTXSubtarget;
+
//===--------------------------------------------------------------------===//
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
@@ -196,9 +202,9 @@ public:
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned getFunctionAlignment(const Function *F) const;
- EVT getSetCCResultType(LLVMContext &, EVT VT) const override {
+ EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
if (VT.isVector())
- return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
return MVT::i1;
}
@@ -236,7 +242,8 @@ public:
// PTX always uses 32-bit shift amounts
MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
- bool shouldSplitVectorType(EVT VT) const override;
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
private:
const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
@@ -255,8 +262,12 @@ private:
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
Type *Ty, unsigned Idx) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 397f4bc..a98fb37 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -146,7 +146,7 @@ bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
// We implement "poor man's DCE" here to make sure any code that is no longer
// live is actually unreachable and can be trivially eliminated by the
- // unreachable block elimiation pass.
+ // unreachable block elimination pass.
for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
UI != UE; ++UI) {
if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index cdc8088..b5b4fbe 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -29,8 +29,8 @@ using namespace llvm;
void NVPTXInstrInfo::anchor() {}
// FIXME: Add the subtarget support on this constructor.
-NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
- : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {}
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI)
+ : NVPTXGenInstrInfo(), RegInfo(STI) {}
void NVPTXInstrInfo::copyPhysReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 88a9e45..2ac2974 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -24,11 +24,10 @@
namespace llvm {
class NVPTXInstrInfo : public NVPTXGenInstrInfo {
- NVPTXTargetMachine &TM;
const NVPTXRegisterInfo RegInfo;
virtual void anchor();
public:
- explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
+ explicit NVPTXInstrInfo(NVPTXSubtarget &STI);
const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index fbcd0e4..d2c0373 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -158,9 +158,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
def true : Predicate<"1">;
+def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">;
+
//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
@@ -461,33 +464,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
}]>;
-def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDES64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
"mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, i64imm:$b),
+def MULWIDES64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
"mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDEU64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
"mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, i64imm:$b),
+def MULWIDEU64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
"mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
+def MULWIDES32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
"mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, i32imm:$b),
+def MULWIDES32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
"mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, i32imm:$b),
+def MULWIDEU32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
"mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
@@ -507,25 +522,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
(MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>,
+ (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>,
+ (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>,
+ (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>,
+ (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+
+def SDTMulWide
+ : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed
+ : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned
+ : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
Requires<[doMulWide]>;
defm MULT : I3<"mul.lo.s", mul>;
@@ -541,69 +594,75 @@ defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+def SDTIMAD
+ : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+ SDTCisInt<2>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def imad
+ : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, i16imm:$b, i16imm:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b),
- imm:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, imm:$c))]>;
def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, i32imm:$b, i32imm:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, imm:$b), imm:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, imm:$c))]>;
def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, i64imm:$b, i64imm:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, imm:$b), imm:$c))]>;
-
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, imm:$c))]>;
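A minimal standalone sketch (not part of the patch) of the mad.lo semantics these patterns target: the instruction keeps only the low N bits of a*b + c, matching the wrapping behaviour of the separate mul.lo + add sequence it replaces.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t a = 0x12345678u, b = 0x9abcdef0u, c = 42u;
  // mad.lo.s32 d, a, b, c keeps the low 32 bits of a*b + c ...
  uint64_t full = static_cast<uint64_t>(a) * b + c;
  uint32_t madLo = static_cast<uint32_t>(full);
  // ... which is exactly what the separate mul.lo + add compute mod 2^32.
  assert(madLo == a * b + c);
  return 0;
}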
def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"neg.s16 \t$dst, $src;",
@@ -809,36 +868,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
def rrr : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, Float32Regs:$b),
- Float32Regs:$c))]>, Requires<[Pred]>;
- // This is to WAR a weird bug in Tablegen that does not automatically
- // generate the following permutated rule rrr2 from the above rrr.
- // So we explicitly add it here. This happens to FMA32 only.
- // See the comments at FMAD32 and FMA32 for more information.
- def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd Float32Regs:$c,
- (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
Requires<[Pred]>;
def rri : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
Requires<[Pred]>;
def rir : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
Requires<[Pred]>;
def rii : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b, f32imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
Requires<[Pred]>;
}
@@ -846,73 +895,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
def rrr : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, Float64Regs:$b),
- Float64Regs:$c))]>, Requires<[Pred]>;
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
+ Requires<[Pred]>;
def rri : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
- Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
def rir : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
Requires<[Pred]>;
def rii : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b, f64imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
Requires<[Pred]>;
}
-// Due to a unknown reason (most likely a bug in tablegen), tablegen does not
-// automatically generate the rrr2 rule from
-// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
-// If we reverse the order of the following two lines, then rrr2 rule will be
-// generated for FMA32, but not for rrr.
-// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
-defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
-
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
- (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
- Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c
-// b*c-a => fmad(b, c, -a)
-// - legal because b*c-a <=> b*c+(-a)
-multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)),
- (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>,
- Requires<[Pred]>;
- def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
- (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
- Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)),
- (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>,
- Requires<[Pred]>;
-
- def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a),
- (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>,
- Requires<[Pred]>;
-}
-
-defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
-defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
-defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
+defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
@@ -1083,6 +1091,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+//
+// Rotate: use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+//
+// Rotate: if the ptx shf instruction is not available, then use shift+add
+//
// 32bit
def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
@@ -1100,9 +1145,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{
}]>;
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
Int32Regs:$amt),
@@ -1115,7 +1162,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
!strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
!strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
Int32Regs:$amt),
@@ -1128,7 +1176,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
!strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
!strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
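A minimal standalone sketch (not part of the patch) of the identity the software expansion uses when shf is unavailable: rotate-left is a left shift combined with a right shift by the complementary amount; since the two parts have disjoint bits, the emitted add.u32 behaves like an OR.

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t x, unsigned amt) {
  amt &= 31;                      // reduce the amount modulo the word size
  if (amt == 0) return x;         // avoid the undefined 32-bit shift in C++
  // shl + shr by (32 - amt), then combine the two disjoint halves.
  return (x << amt) | (x >> (32 - amt));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 8) == 0x34567812u);
  return 0;
}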
// 64bit
def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
@@ -1177,6 +1226,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
!strconcat("}}", ""))))))))),
[(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+// BFE - bit-field extract
+
+multiclass BFE<string TyStr, RegisterClass RC> {
+ // BFE supports both 32-bit and 64-bit values, but the start and length
+ // operands are always 32-bit
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+defm BFE_S32 : BFE<"s32", Int32Regs>;
+defm BFE_U32 : BFE<"u32", Int32Regs>;
+defm BFE_S64 : BFE<"s64", Int64Regs>;
+defm BFE_U64 : BFE<"u64", Int64Regs>;
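A rough standalone model (not part of the patch) of what the unsigned form of bfe computes, ignoring the instruction's clamping of the position and length operands: extract len bits of the source starting at bit pos.

#include <cassert>
#include <cstdint>

static uint32_t bfe_u32(uint32_t a, unsigned pos, unsigned len) {
  // Unsigned bit-field extract: take 'len' bits of 'a' starting at 'pos'.
  // (The signed forms additionally sign-extend from the top bit of the
  // extracted field.)
  if (len == 0) return 0;
  uint32_t mask = (len >= 32) ? 0xffffffffu : ((1u << len) - 1u);
  return (a >> pos) & mask;
}

int main() {
  assert(bfe_u32(0xdeadbeefu, 8, 8) == 0xbeu);
  assert(bfe_u32(0xdeadbeefu, 0, 16) == 0xbeefu);
  return 0;
}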
//-----------------------------------
// General Comparison
@@ -1292,6 +1364,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
(ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
(ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+//
+// Funnel shift in clamp mode
+//
+// - SDNodes are created so they can be used in the DAG code,
+// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+//
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
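A minimal standalone sketch (not part of the patch) of the arithmetic the clamp-mode funnel shift provides, which LowerShiftLeftParts / LowerShiftRightParts build on: for a shift amount below 32, the high word of a 64-bit left shift is a funnel shift of the two 32-bit halves.

#include <cassert>
#include <cstdint>

// High word of shifting the 64-bit value hi:lo left by amt (0 < amt < 32);
// the .clamp form additionally saturates larger shift amounts.
static uint32_t funnel_shl(uint32_t lo, uint32_t hi, unsigned amt) {
  return (hi << amt) | (lo >> (32 - amt));
}

int main() {
  uint64_t x = 0x0123456789abcdefull;
  unsigned amt = 12;
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  assert(funnel_shl(lo, hi, amt) == static_cast<uint32_t>((x << amt) >> 32));
  return 0;
}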
//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5e228fc..0ad3dfa 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1057,12 +1057,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_max_32 node:$a, node:$b)>;
def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1072,6 +1084,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
+ atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
@@ -1080,6 +1100,14 @@ defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
+ atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_min
@@ -1089,12 +1117,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_min_32 node:$a, node:$b)>;
def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1104,6 +1144,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
+ atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
@@ -1112,6 +1160,14 @@ defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
+ atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_inc atom_dec
@@ -1153,6 +1209,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_and_32 node:$a, node:$b)>;
def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1162,6 +1224,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
+ atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
+ atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
+ atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_or
@@ -1171,6 +1241,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_or_32 node:$a, node:$b)>;
def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1180,6 +1256,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
+ atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
+ atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
+ atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
// atom_xor
@@ -1189,6 +1273,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_xor_32 node:$a, node:$b)>;
def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1198,6 +1288,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
+ atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
+ atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
+ atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_cas
@@ -1276,67 +1374,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
// Support for ldu on sm_20 or later
//-----------------------------------
-def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{
- MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
- return M->getMemoryVT() == MVT::i8;
-}]>;
-
// Scalar
-// @TODO: Revisit this, Changed imemAny to imem
-multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
+ []>, Requires<[hasLDU]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
}
-multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
- def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
- def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDU]>;
- def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
- def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
-}
-
-defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs,
- ldu_i8>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs,
-int_nvvm_ldu_global_f>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs,
-int_nvvm_ldu_global_f>;
-defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
-int_nvvm_ldu_global_p>;
-defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
-int_nvvm_ldu_global_p>;
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
// vector
@@ -1406,65 +1470,40 @@ defm INT_PTX_LDU_G_v4f32_ELE
// Support for ldg on sm_35 or later
//-----------------------------------
-def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{
- MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
- return M->getMemoryVT() == MVT::i8;
-}]>;
-
-multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
+ []>, Requires<[hasLDG]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
-}
-
-multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
- def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
- def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDG]>;
- def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
- def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
}
defm INT_PTX_LDG_GLOBAL_i8
- : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>;
+ : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i16
- : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
defm INT_PTX_LDG_GLOBAL_f32
- : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>;
+ : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64
- : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>;
+ : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDG_GLOBAL_p32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>;
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_p64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>;
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
// vector
@@ -1689,6 +1728,207 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
[(int_nvvm_compiler_error Int64Regs:$a)]>;
+// isspacep
+
+def ISSPACEP_CONST_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+ (ins SpecialRegs:$r),
+ "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+
+
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+ (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]> ;
+
+def GET_LO_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def GET_HI_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+ (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src))> ;
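A minimal standalone sketch (not part of the patch) of what the unpack/pack pair above implements for the swap intrinsic: exchanging the two 32-bit halves of a 64-bit value, i.e. a rotate by 32 bits.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x0123456789abcdefull;
  uint32_t lo = static_cast<uint32_t>(x);        // what GET_LO_INT64 extracts
  uint32_t hi = static_cast<uint32_t>(x >> 32);  // what GET_HI_INT64 extracts
  // Repacking with the halves exchanged swaps lo and hi.
  uint64_t swapped = (static_cast<uint64_t>(lo) << 32) | hi;
  assert(swapped == 0x89abcdef01234567ull);
  return 0;
}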
+
+// funnel shift, requires >= sm_32
+def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+
+
//-----------------------------------
// Texture Intrinsics
//-----------------------------------
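
A minimal host-side C++ sketch of what the hasHWROT32 64-bit rotate patterns above compute: each 32-bit half of the result is one shf.l.wrap.b32 funnel shift over the lo/hi pair, and PACK_TWO_INT32 reassembles the halves. The helper names are invented for illustration only, and the shift amount is taken modulo 32, matching the wrap forms of shf.

#include <cstdint>

// shf.l.wrap.b32 d, a, b, n:  d = high 32 bits of ({b,a} << (n & 31))
static uint32_t shf_l_wrap_b32(uint32_t a, uint32_t b, uint32_t n) {
  uint64_t pair = (uint64_t(b) << 32) | a;
  return uint32_t((pair << (n & 31)) >> 32);
}

// Mirrors PACK_TWO_INT32(SHF_L_WRAP(hi, lo, n), SHF_L_WRAP(lo, hi, n)).
uint64_t rotate_b64_sketch(uint64_t src, uint32_t n) {
  uint32_t lo = uint32_t(src), hi = uint32_t(src >> 32);
  uint32_t new_lo = shf_l_wrap_b32(hi, lo, n);  // low word of the rotate
  uint32_t new_hi = shf_l_wrap_b32(lo, hi, n);  // high word of the rotate
  return (uint64_t(new_hi) << 32) | new_lo;
}
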
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 0ee018c..5547649 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -66,7 +66,7 @@ public:
const MCAsmLayout *Layout) const override {
return false;
}
- void AddValueSymbols(MCAssembler *) const override {};
+ void visitUsedExpr(MCStreamer &Streamer) const override {};
const MCSection *FindAssociatedSection() const override {
return nullptr;
}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 7a38a66..3482248 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -46,6 +46,10 @@ foreach i = 0-4 in {
def da#i : NVPTXReg<"%da"#i>;
}
+foreach i = 0-31 in {
+ def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
@@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+ (sequence "ENVREG%u", 0, 31))>;
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 8c7df52..d5cded2 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -25,10 +25,41 @@ using namespace llvm;
// Pin the vtable to this file.
void NVPTXSubtarget::anchor() {}
+static std::string computeDataLayout(bool is64Bit) {
+ std::string Ret = "e";
+
+ if (!is64Bit)
+ Ret += "-p:32:32";
+
+ Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+ return Ret;
+}
+
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ // Provide the default CPU if we don't have one.
+ if (CPU.empty() && FS.size())
+ llvm_unreachable("we are not using FeatureStr");
+ TargetName = CPU.empty() ? "sm_20" : CPU;
+
+ ParseSubtargetFeatures(TargetName, FS);
+
+ // Set default to PTX 3.2 (CUDA 5.5)
+ if (PTXVersion == 0) {
+ PTXVersion = 32;
+ }
+
+ return *this;
+}
+
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit)
+ const std::string &FS, const TargetMachine &TM,
+ bool is64Bit)
: NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
- SmVersion(20) {
+ SmVersion(20), DL(computeDataLayout(is64Bit)),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) {
Triple T(TT);
@@ -36,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
drvInterface = NVPTX::NVCL;
else
drvInterface = NVPTX::CUDA;
-
- // Provide the default CPU if none
- std::string defCPU = "sm_20";
-
- ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
-
- // Get the TargetName from the FS if available
- if (FS.empty() && CPU.empty())
- TargetName = defCPU;
- else if (!CPU.empty())
- TargetName = CPU;
- else
- llvm_unreachable("we are not using FeatureStr");
-
- // We default to PTX 3.1, but we cannot just default to it in the initializer
- // since the attribute parser checks if the given option is >= the default.
- // So if we set ptx31 as the default, the ptx30 attribute would never match.
- // Instead, we use 0 as the default and manually set 31 if the default is
- // used.
- if (PTXVersion == 0) {
- PTXVersion = 31;
- }
}
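
A small standalone check, not part of the patch, that mirrors computeDataLayout() above and spells out the two strings it can produce: nvptx gets explicit 32-bit pointers, while nvptx64 relies on the 64-bit defaults.

#include <cassert>
#include <string>

static std::string computeDataLayout(bool is64Bit) {
  std::string Ret = "e";                      // little endian
  if (!is64Bit)
    Ret += "-p:32:32";                        // 32-bit pointers for nvptx
  Ret += "-i64:64-v16:16-v32:32-n16:32:64";   // common suffix
  return Ret;
}

int main() {
  assert(computeDataLayout(false) ==
         "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64");
  assert(computeDataLayout(true) == "e-i64:64-v16:16-v32:32-n16:32:64");
}
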
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 581e5ed..3ed5747 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -15,6 +15,12 @@
#define NVPTXSUBTARGET_H
#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -35,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
unsigned int SmVersion;
+ const DataLayout DL; // Calculates type size & alignment
+ NVPTXInstrInfo InstrInfo;
+ NVPTXTargetLowering TLInfo;
+ TargetSelectionDAGInfo TSInfo;
+
+  // NVPTX does not have any call stack frame, but it needs an NVPTX-specific
+  // FrameLowering class because TargetFrameLowering is abstract.
+ NVPTXFrameLowering FrameLowering;
+
public:
/// This constructor initializes the data members to match that
/// of the specified module.
///
NVPTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit);
+ const std::string &FS, const TargetMachine &TM, bool is64Bit);
+
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const NVPTXRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
bool hasBrkPt() const { return SmVersion >= 11; }
bool hasAtomRedG32() const { return SmVersion >= 11; }
@@ -57,10 +81,12 @@ public:
bool hasFMAF32() const { return SmVersion >= 20; }
bool hasFMAF64() const { return SmVersion >= 13; }
bool hasLDG() const { return SmVersion >= 32; }
- bool hasLDU() const { return SmVersion >= 20; }
+ bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
bool hasGenericLdSt() const { return SmVersion >= 20; }
- inline bool hasHWROT32() const { return false; }
- inline bool hasSWROT32() const { return true; }
+ inline bool hasHWROT32() const { return SmVersion >= 32; }
+ inline bool hasSWROT32() const {
+ return ((SmVersion >= 20) && (SmVersion < 32));
+ }
inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
inline bool hasROT64() const { return SmVersion >= 20; }
@@ -76,6 +102,7 @@ public:
unsigned getPTXVersion() const { return PTXVersion; }
+ NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
};
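
The new rotate predicates split the SM range into disjoint halves. As a quick illustration (thresholds copied from the header above, everything else assumed), exactly one of hasHWROT32/hasSWROT32 holds for each processor defined in NVPTX.td, which is what lets the hasHWROT32/noHWROT32 patterns in NVPTXIntrinsics.td partition cleanly.

#include <cassert>
#include <initializer_list>

static bool hasHWROT32(unsigned SmVersion) { return SmVersion >= 32; }
static bool hasSWROT32(unsigned SmVersion) {
  return SmVersion >= 20 && SmVersion < 32;
}

int main() {
  // Processors defined in NVPTX.td: sm_20, sm_21, sm_30, sm_35, sm_50.
  for (unsigned Sm : {20u, 21u, 30u, 35u, 50u})
    assert(hasHWROT32(Sm) != hasSWROT32(Sm));  // exactly one rotate path applies
}
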
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 26a4f84..069a1b9 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -66,26 +66,13 @@ extern "C" void LLVMInitializeNVPTXTarget() {
*PassRegistry::getPassRegistry());
}
-static std::string computeDataLayout(const NVPTXSubtarget &ST) {
- std::string Ret = "e";
-
- if (!ST.is64Bit())
- Ret += "-p:32:32";
-
- Ret += "-i64:64-v16:16-v32:32-n16:32:64";
-
- return Ret;
-}
-
-NVPTXTargetMachine::NVPTXTargetMachine(
- const Target &T, StringRef TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool is64bit)
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)),
- InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(
- *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
+ Subtarget(TT, CPU, FS, *this, is64bit) {
initAsmInfo();
}
@@ -119,6 +106,7 @@ public:
bool addInstSelector() override;
bool addPreRegAlloc() override;
bool addPostRegAlloc() override;
+ void addMachineSSAOptimization() override;
FunctionPass *createTargetRegisterAllocator(bool) override;
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -220,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
printAndVerify("After StackSlotColoring");
}
+
+void NVPTXPassConfig::addMachineSSAOptimization() {
+ // Pre-ra tail duplication.
+ if (addPass(&EarlyTailDuplicateID))
+ printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+ printAndVerify("After codegen DCE pass");
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ if (addILPOpts())
+ printAndVerify("After ILP optimizations");
+
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+
+ addPass(&MachineSinkingID);
+ printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+ addPass(&PeepholeOptimizerID);
+ printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 2db7c18..a7a1c8f 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -14,13 +14,8 @@
#ifndef NVPTX_TARGETMACHINE_H
#define NVPTX_TARGETMACHINE_H
-#include "ManagedStringPool.h"
-#include "NVPTXFrameLowering.h"
-#include "NVPTXISelLowering.h"
-#include "NVPTXInstrInfo.h"
-#include "NVPTXRegisterInfo.h"
#include "NVPTXSubtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "ManagedStringPool.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
@@ -31,50 +26,37 @@ namespace llvm {
///
class NVPTXTargetMachine : public LLVMTargetMachine {
NVPTXSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- NVPTXInstrInfo InstrInfo;
- NVPTXTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
-
- // NVPTX does not have any call stack frame, but need a NVPTX specific
- // FrameLowering class because TargetFrameLowering is abstract.
- NVPTXFrameLowering FrameLowering;
// Hold Strings that can be free'd all together with NVPTXTargetMachine
ManagedStringPool ManagedStrPool;
- //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
- // bool DisableVerify, MCContext *&OutCtx);
-
public:
NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
const TargetOptions &Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const NVPTXInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
- const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const DataLayout *getDataLayout() const override { return &DL; }
const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-
const NVPTXRegisterInfo *getRegisterInfo() const override {
- return &(InstrInfo.getRegisterInfo());
+ return getSubtargetImpl()->getRegisterInfo();
}
- NVPTXTargetLowering *getTargetLowering() const override {
- return const_cast<NVPTXTargetLowering *>(&TLInfo);
+ const NVPTXTargetLowering *getTargetLowering() const override {
+ return getSubtargetImpl()->getTargetLowering();
}
const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
- //virtual bool addInstSelector(PassManagerBase &PM,
- // CodeGenOpt::Level OptLevel);
-
- //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level);
-
ManagedStringPool *getManagedStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
}
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index cb8bd72..a8d6b95 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
@@ -47,17 +48,16 @@ class NVVMReflect : public ModulePass {
private:
StringMap<int> VarMap;
typedef DenseMap<std::string, int>::iterator VarMapIter;
- Function *ReflectFunction;
public:
static char ID;
- NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) {
+ NVVMReflect() : ModulePass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
VarMap.clear();
}
NVVMReflect(const StringMap<int> &Mapping)
- : ModulePass(ID), ReflectFunction(nullptr) {
+ : ModulePass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
I != E; ++I) {
@@ -70,6 +70,8 @@ public:
}
bool runOnModule(Module &) override;
+private:
+ bool handleFunction(Function *ReflectFunction);
void setVarMap();
};
}
@@ -120,19 +122,7 @@ void NVVMReflect::setVarMap() {
}
}
-bool NVVMReflect::runOnModule(Module &M) {
- if (!NVVMReflectEnabled)
- return false;
-
- setVarMap();
-
- ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
-
- // If reflect function is not used, then there will be
- // no entry in the module.
- if (!ReflectFunction)
- return false;
-
+bool NVVMReflect::handleFunction(Function *ReflectFunction) {
// Validate _reflect function
assert(ReflectFunction->isDeclaration() &&
"_reflect function should not have a body");
@@ -155,13 +145,15 @@ bool NVVMReflect::runOnModule(Module &M) {
"Only one operand expect for _reflect function");
// In cuda, we will have an extra constant-to-generic conversion of
// the string.
- const Value *conv = Reflect->getArgOperand(0);
- assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion");
- const CallInst *ConvCall = cast<CallInst>(conv);
- const Value *str = ConvCall->getArgOperand(0);
- assert(isa<ConstantExpr>(str) &&
+ const Value *Str = Reflect->getArgOperand(0);
+ if (isa<CallInst>(Str)) {
+ // CUDA path
+ const CallInst *ConvCall = cast<CallInst>(Str);
+ Str = ConvCall->getArgOperand(0);
+ }
+ assert(isa<ConstantExpr>(Str) &&
"Format of _reflect function not recognized");
- const ConstantExpr *GEP = cast<ConstantExpr>(str);
+ const ConstantExpr *GEP = cast<ConstantExpr>(Str);
const Value *Sym = GEP->getOperand(0);
assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
@@ -195,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) {
ToRemove[i]->eraseFromParent();
return true;
}
+
+bool NVVMReflect::runOnModule(Module &M) {
+ if (!NVVMReflectEnabled)
+ return false;
+
+ setVarMap();
+
+
+ bool Res = false;
+ std::string Name;
+ Type *Tys[1];
+ Type *I8Ty = Type::getInt8Ty(M.getContext());
+ Function *ReflectFunction;
+
+ // Check for standard overloaded versions of llvm.nvvm.reflect
+
+ for (unsigned i = 0; i != 5; ++i) {
+ Tys[0] = PointerType::get(I8Ty, i);
+ Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
+ ReflectFunction = M.getFunction(Name);
+    if (ReflectFunction != nullptr) {
+ Res |= handleFunction(ReflectFunction);
+ }
+ }
+
+ ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
+  // If the reflect function is not used, then there will be
+  // no entry in the module.
+  if (ReflectFunction != nullptr)
+ Res |= handleFunction(ReflectFunction);
+
+ return Res;
+}
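
For reference, the lookup loop above asks for the overloaded reflect intrinsic once per address space 0-4 and then falls back to the plain NVVM_REFLECT_FUNCTION declaration. The sketch below only prints the names that lookup is assumed to produce; the pNi8 suffix for an i8-pointer overload and the __nvvm_reflect fallback name are assumptions about the surrounding code, not something this hunk spells out.

#include <cstdio>

int main() {
  // One overloaded name per address space queried by runOnModule().
  for (unsigned AS = 0; AS != 5; ++AS)
    std::printf("llvm.nvvm.reflect.p%ui8\n", AS);  // e.g. llvm.nvvm.reflect.p0i8
  // Legacy, non-overloaded entry point (NVVM_REFLECT_FUNCTION).
  std::printf("__nvvm_reflect\n");
}
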
diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h
index 45cc0b8..02c5a94 100644
--- a/lib/Target/NVPTX/cl_common_defines.h
+++ b/lib/Target/NVPTX/cl_common_defines.h
@@ -1,5 +1,5 @@
-#ifndef __CL_COMMON_DEFINES_H__
-#define __CL_COMMON_DEFINES_H__
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
// This file includes defines that are common to both kernel code and
// the NVPTX back-end.
@@ -119,4 +119,4 @@ typedef enum clk_sampler_type {
#define CLK_LOCAL_MEM_FENCE (1 << 0)
#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-#endif // __CL_COMMON_DEFINES_H__
+#endif // CL_COMMON_DEFINES_H