aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/R600/AMDGPUISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/R600/AMDGPUISelLowering.cpp')
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp480
1 files changed, 345 insertions, 135 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 2f95b74..4707279 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
- TargetLowering(TM) {
-
- Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
-
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+ const AMDGPUSubtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
setOperationAction(ISD::Constant, MVT::i32, Legal);
setOperationAction(ISD::Constant, MVT::i64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -127,12 +125,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FABS, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -141,9 +148,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
- setOperationAction(ISD::STORE, MVT::i64, Promote);
- AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
-
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -162,9 +166,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// Custom lowering of vector stores is required for local address space
// stores.
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- // XXX: Native v2i32 local address space stores are possible, but not
- // currently implemented.
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@@ -187,9 +188,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- setOperationAction(ISD::LOAD, MVT::i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
@@ -216,18 +214,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -246,7 +254,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -382,6 +391,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
@@ -397,6 +412,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// large sequence of instructions.
setIntDivIsCheap(false);
setPow2SDivIsCheap(false);
+ setFsqrtIsCheap(true);
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
@@ -429,6 +445,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+ ISD::LoadExtType,
+ EVT NewVT) const {
+
+ unsigned NewSize = NewVT.getStoreSizeInBits();
+
+ // If we are reducing to a 32-bit load, this is always better.
+ if (NewSize == 32)
+ return true;
+
+ EVT OldVT = N->getValueType(0);
+ unsigned OldSize = OldVT.getStoreSizeInBits();
+
+ // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+ // extloads, so doing one requires using a buffer_load. In cases where we
+ // still couldn't use a scalar load, using the wider load shouldn't really
+ // hurt anything.
+
+ // If the old size already had to be an extload, there's no harm in continuing
+ // to reduce the width.
+ return (OldSize < 32);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -442,6 +481,18 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
(LScalarSize < 32));
}
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+ return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+ return true;
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -560,6 +611,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@@ -619,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+ const DataLayout *TD = getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -707,7 +759,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+ const DataLayout *TD = getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
@@ -810,8 +862,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
- getTargetMachine().getSubtargetImpl()->getFrameLowering());
+ const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -866,10 +917,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::AMDGPU_div_fmas:
- // FIXME: Dropping bool parameter. Work is needed to support the implicit
- // read from VCC.
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
case Intrinsic::AMDGPU_div_fixup:
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
@@ -889,7 +939,19 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::AMDGPU_rsq_clamped:
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, VT));
+ } else {
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ }
case Intrinsic::AMDGPU_ldexp:
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
@@ -962,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case AMDGPUIntrinsic::AMDGPU_brev:
return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+ case Intrinsic::AMDGPU_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
@@ -1000,17 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
}
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
+
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
switch (CCOpcode) {
case ISD::SETOEQ:
@@ -1027,27 +1097,47 @@ SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
case ISD::SETO:
break;
case ISD::SETULE:
- case ISD::SETULT:
+ case ISD::SETULT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ }
case ISD::SETOLE:
case ISD::SETOLT:
case ISD::SETLE:
case ISD::SETLT: {
+ // Ordered. Assume ordered for undefined.
+
+ // Only do this after legalization to avoid interfering with other combines
+ // which might occur.
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
// We need to permute the operands to get the correct NaN behavior. The
// selected operand is the second one based on the failing compare with NaN,
// so permute it based on the compare type the hardware uses.
if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
- return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
}
case ISD::SETGT:
case ISD::SETGE:
- case ISD::SETUGE:
case ISD::SETOGE:
- case ISD::SETUGT:
case ISD::SETOGT: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
if (LHS == True)
- return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
- return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
}
case ISD::SETCC_INVALID:
llvm_unreachable("Invalid setcc condcode!");
@@ -1330,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT MemVT = Load->getMemoryVT();
- if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
- // We can do the extload to 32-bits, and then need to separately extend to
- // 64-bits.
-
- SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
- Load->getChain(),
- Load->getBasePtr(),
- MemVT,
- Load->getMemOperand());
-
- SDValue Ops[] = {
- DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
- ExtLoad32.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
assert(VT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
@@ -1586,12 +1658,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+ if (VT == MVT::i64 &&
+ DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+ DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+ SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+ LHS_Lo, RHS_Lo);
+
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+ return;
+ }
+
// Get Speculative values
SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Hi = zero;
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
@@ -1599,8 +1685,10 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const unsigned halfBitWidth = HalfVT.getSizeInBits();
for (unsigned i = 0; i < halfBitWidth; ++i) {
- SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
- // Get Value of high bit
+ const unsigned bitPos = halfBitWidth - i - 1;
+ SDValue POS = DAG.getConstant(bitPos, HalfVT);
+ // Get value of high bit
+ // TODO: Remove the BFE part when the optimization is fixed
SDValue HBit;
if (halfBitWidth == 32 && Subtarget->hasBFE()) {
HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
@@ -1608,33 +1696,23 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
}
+ HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
- SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
- DAG.getConstant(halfBitWidth - 1, HalfVT));
- REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
- REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
- REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
- REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+ // Shift
+ REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT));
+ // Add LHS high bit
+ REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
- SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+ SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
// Update REM
-
SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
- REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
- REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
}
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
Results.push_back(DIV);
Results.push_back(REM);
@@ -1655,8 +1733,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SDValue Den = Op.getOperand(1);
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
- DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+ if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
+ DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
// TODO: We technically could do this for i64, but shouldn't that just be
// handled by something generally reducing 64-bit division on 32-bit
// values to 32-bit?
@@ -1768,19 +1846,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- if (VT == MVT::i32) {
- if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
- DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, true);
- }
- }
-
SDValue Zero = DAG.getConstant(0, VT);
SDValue NegOne = DAG.getConstant(-1, VT);
+ if (VT == MVT::i32 &&
+ DAG.ComputeNumSignBits(LHS) > 8 &&
+ DAG.ComputeNumSignBits(RHS) > 8) {
+ return LowerDIVREM24(Op, DAG, true);
+ }
+ if (VT == MVT::i64 &&
+ DAG.ComputeNumSignBits(LHS) > 32 &&
+ DAG.ComputeNumSignBits(RHS) > 32) {
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ // Hi/Lo split: both operands are sign-extended 32-bit values, so only the low halves are needed.
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+ SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+ LHS_Lo, RHS_Lo);
+ SDValue Res[2] = {
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
+ DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+ };
+ return DAG.getMergeValues(Res, DL);
+ }
+
SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1845,6 +1935,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ Hi,
+ DAG.getConstant(FractBits - 32, MVT::i32),
+ DAG.getConstant(ExpBits, MVT::i32));
+ SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+ DAG.getConstant(1023, MVT::i32));
+
+ return Exp;
+}
+
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -1860,16 +1964,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
// exponent.
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
- const unsigned FractBits = 52;
- const unsigned ExpBits = 11;
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
- // Extract the exponent.
- SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
- Hi,
- DAG.getConstant(FractBits - 32, MVT::i32),
- DAG.getConstant(ExpBits, MVT::i32));
- SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
- DAG.getConstant(1023, MVT::i32));
+ const unsigned FractBits = 52;
// Extract the sign bit.
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@@ -1932,6 +2029,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
}
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+
+ SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+ SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+ SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+ const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
+
+ SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+ SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+ SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+ return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+
+ SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+ const SDValue Zero = DAG.getConstant(0, MVT::i32);
+ const SDValue One = DAG.getConstant(1, MVT::i32);
+ const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
+ const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+ SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+ const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
+
+ SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+ SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+ DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
+ Exp);
+
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+ SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+ DAG.getConstant(0, MVT::i64), Tmp0,
+ ISD::SETNE);
+
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+ D, DAG.getConstant(0, MVT::i64));
+ SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+ K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+ K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+ SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+ SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+ ExpEqNegOne,
+ DAG.getConstantFP(1.0, MVT::f64),
+ DAG.getConstantFP(0.0, MVT::f64));
+
+ SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+ K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+ return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::f32)
+ return LowerFROUND32(Op, DAG);
+
+ if (VT == MVT::f64)
+ return LowerFROUND64(Op, DAG);
+
+ llvm_unreachable("unhandled type");
+}
+
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2155,7 +2345,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDValue Value = SN->getValue();
EVT VT = Value.getValueType();
- if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+ if (isTypeLegal(VT) || SN->isVolatile() ||
+ !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
return SDValue();
LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
@@ -2231,27 +2422,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
simplifyI24(N1, DCI);
return SDValue();
}
- case ISD::SELECT_CC: {
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::f32 ||
- (VT == MVT::f64 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDValue True = N->getOperand(2);
- SDValue False = N->getOperand(3);
- SDValue CC = N->getOperand(4);
-
- return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
-
- break;
- }
case ISD::SELECT: {
SDValue Cond = N->getOperand(0);
- if (Cond.getOpcode() == ISD::SETCC) {
+ if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue LHS = Cond.getOperand(0);
@@ -2261,11 +2434,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
- if (VT == MVT::f32 ||
- (VT == MVT::f64 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
- return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
+ if (VT == MVT::f32)
+ return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
// TODO: Implement min / max Evergreen instructions.
if (VT == MVT::i32 &&
@@ -2451,7 +2621,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(MAD)
NODE_NAME_CASE(FMAX_LEGACY)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
@@ -2474,6 +2643,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RSQ_CLAMPED)
NODE_NAME_CASE(LDEXP)
+ NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
@@ -2505,6 +2675,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rsq instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ // Reciprocal, < 1 ulp error.
+ //
+ // This reciprocal approximation converges to < 0.5 ulp error with one
+ // Newton-Raphson step performed with two fused multiply-adds (FMAs).
+
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rcp instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
static void computeKnownBitsForMinMax(const SDValue Op0,
const SDValue Op1,
APInt &KnownZero,