author | Stephen Hines <srhines@google.com> | 2015-04-01 18:49:24 +0000
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2015-04-01 18:49:26 +0000
commit | 3fa16bd6062e23bcdb82ed4dd965674792e6b761 (patch)
tree | 9348fc507292f7e8715d22d64ce5a32131b4f875 /lib/Target/R600/SIISelLowering.cpp
parent | beed47390a60f6f0c77532b3d3f76bb47ef49423 (diff)
parent | ebe69fe11e48d322045d5949c83283927a0d790b (diff)
Merge "Update aosp/master LLVM for rebase to r230699."
Diffstat (limited to 'lib/Target/R600/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/R600/SIISelLowering.cpp | 867
1 file changed, 379 insertions, 488 deletions
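
Among the changes in the diff below are new DAG combines (performAndCombine, performOrCombine, performSetCCCombine) that fold floating-point classification checks into a single AMDGPUISD::FP_CLASS node. As a rough source-level illustration of the pattern being targeted — the function name and exact compare form here are hypothetical, not taken from the patch — the combine collapses two float compares into one class test:

```cpp
// Illustrative only: the C-level shape of the pattern that the new
// performAndCombine folds into one FP_CLASS (v_cmp_class) node,
//   (and (fcmp ord x, x), (fcmp une (fabs x), inf))
// i.e. "x is not NaN and |x| is not infinity".
#include <cmath>

bool is_finite_like(float x) {
  return (x == x) &&                  // ordered compare: x is not a NaN
         (std::fabs(x) != INFINITY);  // |x| is not infinity
}
```

With the combine, the two compares are expected to become a single classification test whose mask excludes the NaN and infinity classes, as computed in performAndCombine.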
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 8d4164a..7d794b8 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -35,8 +35,9 @@ using namespace llvm; -SITargetLowering::SITargetLowering(TargetMachine &TM) : - AMDGPUTargetLowering(TM) { +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -44,7 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); @@ -59,22 +60,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - computeRegisterProperties(); - - // Condition Codes - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - - setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + computeRegisterProperties(STI.getRegisterInfo()); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); @@ -104,12 +90,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Promote); - AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -147,26 +129,34 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); - 
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); @@ -213,13 +203,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } } - for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) { - MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -228,6 +211,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); @@ -235,7 +219,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); - + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); // All memory operations. Some folding on the pointer operand is done to help @@ -315,7 +300,7 @@ bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *IsFast) const { @@ -327,9 +312,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // XXX - CI changes say "Support for unaligned memory accesses" but I don't - // see what for specifically. The wording everywhere else seems to be the - // same. + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. // XXX - The only mention I see of this in the ISA manual is for LDS direct // reads the "byte address and must be dword aligned". Is it also true for the @@ -341,12 +325,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return Align % 4 == 0; } + // Smaller than dword value must be aligned. 
+ // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the // byte-address are ignored, thus forcing Dword alignment. // This applies to private, global, and constant memory. if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32); + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; } EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, @@ -379,8 +369,8 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); return TII->isInlineConstant(Imm); } @@ -413,16 +403,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, } SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - - const TargetMachine &TM = getTargetMachine(); + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); @@ -461,7 +446,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, // NOT four or eigth. - Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); for (unsigned j = 0; j != NumElements; ++j) { @@ -489,7 +474,10 @@ SDValue SITargetLowering::LowerFormalArguments( // The pointer to the list of arguments is stored in SGPR0, SGPR1 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { - Info->NumUserSGPRs = 4; + if (Subtarget->isAmdHsaOS()) + Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
+ else + Info->NumUserSGPRs = 4; unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); @@ -541,7 +529,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); const PointerType *ParamTy = - dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex)); + dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always @@ -576,7 +564,7 @@ SDValue SITargetLowering::LowerFormalArguments( if (Arg.VT.isVector()) { // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); SmallVector<SDValue, 4> Regs; @@ -589,8 +577,7 @@ SDValue SITargetLowering::LowerFormalArguments( // Fill up the missing vector elements NumElements = Arg.VT.getVectorNumElements() - NumElements; - for (unsigned j = 0; j != NumElements; ++j) - Regs.push_back(DAG.getUNDEF(VT)); + Regs.append(NumElements, DAG.getUNDEF(VT)); InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; @@ -598,6 +585,12 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } + + if (Info->getShaderType() != ShaderType::COMPUTE) { + unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( + AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); + Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + } return Chain; } @@ -605,25 +598,14 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: return BB; - case AMDGPU::V_SUB_F64: { - unsigned DestReg = MI->getOperand(0).getReg(); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) - .addImm(0) // SRC0 modifiers - .addReg(MI->getOperand(1).getReg()) - .addImm(1) // SRC1 modifiers - .addReg(MI->getOperand(2).getReg()) - .addImm(0) // CLAMP - .addImm(0); // OMOD - MI->eraseFromParent(); - break; - } + case AMDGPU::BRANCH: + return BB; case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -640,17 +622,43 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return BB; } -EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, for f64 where add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. 
+ return true; +} + +EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { return MVT::i1; } - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); } MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { return MVT::i32; } +// Answering this is somewhat tricky and depends on the specific device which +// have different rates for fma or all f64 operations. +// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32 +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 +// however does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); @@ -659,7 +667,11 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: - return false; /* There is V_MAD_F32 for f32 */ + // This is as fast on some subtargets. However, we always have full rate f32 + // mad available which returns the same result as the separate operations + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. + return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); case MVT::f64: return true; default: @@ -755,15 +767,12 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); // Build the result and - SmallVector<EVT, 4> Res; - for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) - Res.push_back(Intr->getValueType(i)); + ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); // operands of the new intrinsic call SmallVector<SDValue, 4> Ops; Ops.push_back(BRCOND.getOperand(0)); - for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) - Ops.push_back(Intr->getOperand(i)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); Ops.push_back(Target); // build the new intrinsic call @@ -839,7 +848,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -889,13 +898,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); case 
Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { @@ -1090,7 +1099,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const APFloat K1Val(BitsToFloat(0x2f800000)); const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); - const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); @@ -1108,7 +1117,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(); + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, MVT::i32); + + // Figure out if the scale to use for div_fmas. + SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); } SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { @@ -1129,11 +1201,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Store->getMemoryVT(); // These stores are legal. 
- if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - VT.isVector() && VT.getVectorNumElements() == 2 && - VT.getVectorElementType() == MVT::i32) - return SDValue(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { if (VT.isVector() && VT.getVectorNumElements() > 4) return ScalarizeVectorStore(Op, DAG); @@ -1177,7 +1244,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) { + DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); if (ScalarVT != MVT::f32) @@ -1225,8 +1292,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. + if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, Load->getChain(), Load->getBasePtr(), @@ -1297,8 +1377,8 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!CAdd) return SDValue(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
@@ -1316,6 +1396,102 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); } +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + X, DAG.getConstant(Mask, MVT::i32)); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. 
+ static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + Src, DAG.getConstant(NewMask, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, MVT::i1); + } + + return SDValue(); +} + static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -1371,33 +1547,47 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, + LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32)); + } + } + + return SDValue(); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); - EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: { - SDValue Arg0 = N->getOperand(0); - SDValue Arg1 = N->getOperand(1); - SDValue CC = N->getOperand(2); - ConstantSDNode * C = nullptr; - ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); - - // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) - if (VT == MVT::i1 - && Arg0.getOpcode() == ISD::SIGN_EXTEND - && Arg0.getOperand(0).getValueType() == MVT::i1 - && (C = dyn_cast<ConstantSDNode>(Arg1)) - && C->isNullValue() - && CCOp == ISD::SETNE) { - return SimplifySetCC(VT, Arg0.getOperand(0), - DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); - } - break; - } + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); case ISD::FMAXNUM: // TODO: What about fmax_legacy? case ISD::FMINNUM: case AMDGPUISD::SMAX: @@ -1442,6 +1632,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (VT != MVT::f32) break; + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. 
+ if (Subtarget->hasFP32Denormals()) + break; + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -1452,8 +1647,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::FADD) { SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); } } @@ -1461,12 +1656,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (RHS.getOpcode() == ISD::FADD) { SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); } } - break; + return SDValue(); } case ISD::FSUB: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -1476,39 +1671,22 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, // Try to get the fneg to fold into the source modifier. This undoes generic // DAG combines and folds them into the mad. - if (VT == MVT::f32) { + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::FMUL) { - // (fsub (fmul a, b), c) -> mad a, b, (fneg c) - - SDValue A = LHS.getOperand(0); - SDValue B = LHS.getOperand(1); - SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - - if (RHS.getOpcode() == ISD::FMUL) { - // (fsub c, (fmul a, b)) -> mad (fneg a), b, c - - SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); - SDValue B = RHS.getOperand(1); - SDValue C = LHS; - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - if (LHS.getOpcode() == ISD::FADD) { // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32); + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); } } @@ -1517,10 +1695,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); } } + + return SDValue(); } break; @@ -1554,9 +1734,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); if (NewPtr) { - SmallVector<SDValue, 8> NewOps; - for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) - NewOps.push_back(MemNode->getOperand(I)); + SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); @@ -1564,287 +1742,44 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Test if RegClass is one of the VSrc classes -static bool isVSrc(unsigned RegClass) { - switch(RegClass) { - default: return false; - case AMDGPU::VSrc_32RegClassID: - case AMDGPU::VCSrc_32RegClassID: - case AMDGPU::VSrc_64RegClassID: - case AMDGPU::VCSrc_64RegClassID: - return true; - } -} - -/// \brief Test if RegClass is one of the SSrc classes -static bool isSSrc(unsigned RegClass) { - return AMDGPU::SSrc_32RegClassID == RegClass || - AMDGPU::SSrc_64RegClassID == RegClass; -} - /// \brief Analyze the possible immediate value Op /// /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - union { - int32_t I; - float F; - } Imm; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { - if (Node->getZExtValue() >> 32) { - return -1; - } - Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { - if (N->getValueType(0) != MVT::f32) - return -1; - Imm.F = Node->getValueAPF().convertToFloat(); - } else - return -1; // It isn't an immediate - - if ((Imm.I >= -16 && Imm.I <= 64) || - Imm.F == 0.5f || Imm.F == -0.5f || - Imm.F == 1.0f || Imm.F == -1.0f || - Imm.F == 2.0f || Imm.F == -2.0f || - Imm.F == 4.0f || Imm.F == -4.0f) - return 0; // It's an inline immediate - - return Imm.I; // It's a literal immediate -} - -/// \brief Try to fold an immediate directly into an instruction -bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, - bool &ScalarSlotUsed) const { - - MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - if (!Mov || !TII->isMov(Mov->getMachineOpcode())) - return false; - - const SDValue &Op = Mov->getOperand(0); - int32_t Value = analyzeImmediate(Op.getNode()); - if (Value == -1) { - // Not an immediate at all - return false; - - } else if (Value == 0) { - // Inline immediates can always be fold - Operand = Op; - return true; - - } else if (Value == Immediate) { - // Already fold literal immediate - Operand = Op; - return true; - - } else if (!ScalarSlotUsed && !Immediate) { - // Fold this literal immediate - ScalarSlotUsed = true; - Immediate = Value; - Operand = Op; - return true; + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ? 
Val : -1; } - return false; -} + if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; -const TargetRegisterClass *SITargetLowering::getRegClassForNode( - SelectionDAG &DAG, const SDValue &Op) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - if (!Op->isMachineOpcode()) { - switch(Op->getOpcode()) { - case ISD::CopyFromReg: { - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - return MRI.getRegClass(Reg); - } - return TRI.getPhysRegClass(Reg); - } - default: return nullptr; - } - } - const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); - int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; - if (OpClassID != -1) { - return TRI.getRegClass(OpClassID); - } - switch(Op.getMachineOpcode()) { - case AMDGPU::COPY_TO_REGCLASS: - // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. - OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); - - // If the COPY_TO_REGCLASS instruction is copying to a VSrc register - // class, then the register class for the value could be either a - // VReg or and SReg. In order to get a more accurate - if (isVSrc(OpClassID)) - return getRegClassForNode(DAG, Op.getOperand(0)); - - return TRI.getRegClass(OpClassID); - case AMDGPU::EXTRACT_SUBREG: { - int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - const TargetRegisterClass *SuperClass = - getRegClassForNode(DAG, Op.getOperand(0)); - return TRI.getSubClassWithSubReg(SuperClass, SubIdx); - } - case AMDGPU::REG_SEQUENCE: - // Operand 0 is the register class id for REG_SEQUENCE instructions. - return TRI.getRegClass( - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); - default: - return getRegClassFor(Op.getSimpleValueType()); - } -} + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); -/// \brief Does "Op" fit into register class "RegClass" ? -bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, - unsigned RegClass) const { - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); - if (!RC) { - return false; + return -1; } - return TRI->getRegClass(RegClass)->hasSubClassEq(RC); -} -/// \returns true if \p Node's operands are different from the SDValue list -/// \p Ops -static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { - for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { - if (Ops[i].getNode() != Node->getOperand(i).getNode()) { - return true; - } - } - return false; -} - -/// TODO: This needs to be removed. It's current primary purpose is to fold -/// immediates into operands when legal. The legalization parts are redundant -/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook. 
-SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node, - SelectionDAG &DAG) const { - // Original encoding (either e32 or e64) - int Opcode = Node->getMachineOpcode(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - const MCInstrDesc *Desc = &TII->get(Opcode); - - unsigned NumDefs = Desc->getNumDefs(); - unsigned NumOps = Desc->getNumOperands(); - - // Commuted opcode if available - int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); - - assert(!DescRev || DescRev->getNumDefs() == NumDefs); - assert(!DescRev || DescRev->getNumOperands() == NumOps); - - int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; - bool HaveVSrc = false, HaveSSrc = false; - - // First figure out what we already have in this instruction. - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass)) - HaveVSrc = true; - else if (isSSrc(RegClass)) - HaveSSrc = true; - else - continue; - - int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); - if (Imm != -1 && Imm != 0) { - // Literal immediate - Immediate = Imm; - } - } - - // If we neither have VSrc nor SSrc, it makes no sense to continue. - if (!HaveVSrc && !HaveSSrc) - return Node; - - // No scalar allowed when we have both VSrc and SSrc - bool ScalarSlotUsed = HaveVSrc && HaveSSrc; - - // If this instruction has an implicit use of VCC, then it can't use the - // constant bus. - for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) { - if (Desc->ImplicitUses[i] == AMDGPU::VCC) { - ScalarSlotUsed = true; - break; - } - } - - // Second go over the operands and try to fold them - std::vector<SDValue> Ops; - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - const SDValue &Operand = Node->getOperand(i); - Ops.push_back(Operand); - - // Already folded immediate? - if (isa<ConstantSDNode>(Operand.getNode()) || - isa<ConstantFPSDNode>(Operand.getNode())) - continue; - - // Is this a VSrc or SSrc operand? - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass) || isSSrc(RegClass)) { - // Try to fold the immediates. If this ends up with multiple constant bus - // uses, it will be legalized later. - foldImm(Ops[i], Immediate, ScalarSlotUsed); - continue; - } - - if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { - - unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; - assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); - - // Test if it makes sense to swap operands - if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || - (!fitsRegClass(DAG, Ops[1], RegClass) && - fitsRegClass(DAG, Ops[1], OtherRegClass))) { - - // Swap commutable operands - std::swap(Ops[0], Ops[1]); - - Desc = DescRev; - DescRev = nullptr; - continue; - } - } - } - - // Add optional chain and glue - for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) - Ops.push_back(Node->getOperand(i)); - - // Nodes that have a glue result are not CSE'd by getMachineNode(), so in - // this case a brand new node is always be created, even if the operands - // are the same as before. So, manually check if anything has been changed. 
- if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { - return Node; - } - - // Create a complete new instruction - return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); + return -1; } /// \brief Helper function for adjustWritemask @@ -1904,14 +1839,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); - for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) - Ops.push_back(Node->getOperand(i)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Users[Lane]->getValueType(0), SDValue(Node, 0), RC); @@ -1963,9 +1897,8 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); - Node = AdjustRegClass(Node, DAG); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); @@ -1975,17 +1908,17 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, legalizeTargetIndependentNode(Node, DAG); return Node; } - - return legalizeOperands(Node, DAG); + return Node; } /// \brief Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); TII->legalizeOperands(MI); if (TII->isMIMG(MI->getOpcode())) { @@ -1998,14 +1931,13 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, const TargetRegisterClass *RC; switch (BitsSet) { default: return; - case 1: RC = &AMDGPU::VReg_32RegClass; break; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; case 2: RC = &AMDGPU::VReg_64RegClass; break; case 3: RC = &AMDGPU::VReg_96RegClass; break; } unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); MI->setDesc(TII->get(NewOpcode)); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MRI.setRegClass(VReg, RC); return; } @@ -2030,6 +1962,8 @@ static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); #if 1 // XXX - Workaround for moveToVALU not handling different register class // inserts for REG_SEQUENCE. 
@@ -2039,7 +1973,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32), buildSMovImm32(DAG, DL, 0), DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), - buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), DAG.getTargetConstant(AMDGPU::sub1, MVT::i32) }; @@ -2063,7 +1997,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), buildSMovImm32(DAG, DL, 0), DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) }; @@ -2110,57 +2044,14 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size return buildRSRC(DAG, DL, Ptr, 0, Rsrc); } -MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, - SelectionDAG &DAG) const { - - SDLoc DL(N); - unsigned NewOpcode = N->getMachineOpcode(); - - switch (N->getMachineOpcode()) { - default: return N; - case AMDGPU::S_LOAD_DWORD_IMM: - NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - // Fall-through - case AMDGPU::S_LOAD_DWORDX2_SGPR: - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - } - // Fall-through - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - if (NewOpcode == N->getMachineOpcode()) { - NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - } - if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { - return N; - } - ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); - - const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64); - SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0); - MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr); - - SmallVector<SDValue, 8> Ops; - Ops.push_back(SDValue(RSrc, 0)); - Ops.push_back(N->getOperand(0)); - Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)); - - // Copy remaining operands so we keep any chain and glue nodes that follow - // the normal operands. - for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I) - Ops.push_back(N->getOperand(I)); - - return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); - } - } -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { |
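
The bulk of the new LowerFDIV64 above builds an f64 division from v_div_scale, v_rcp and an FMA chain. As a plain scalar sketch of the arithmetic that node sequence computes — the function name is made up, the DIV_SCALE/DIV_FMAS/DIV_FIXUP scaling and special-case handling are deliberately glossed over, and `1.0 / y` merely stands in for the low-precision RCP estimate — the flow is two Newton-Raphson refinements of the reciprocal followed by one refinement of the quotient:

```cpp
// Sketch only (not part of the patch): scalar view of the FMA chain built by
// the new LowerFDIV64.  The comments mirror the SDValue names in the patch;
// DIV_SCALE, DIV_FMAS and DIV_FIXUP are approximated away.
#include <cmath>

double fdiv64_sketch(double x, double y) {
  double r   = 1.0 / y;               // Rcp: low-precision reciprocal estimate
  double e0  = std::fma(-y, r, 1.0);  // Fma0: error term 1 - y*r
  double r1  = std::fma(r, e0, r);    // Fma1: first reciprocal refinement
  double e1  = std::fma(-y, r1, 1.0); // Fma2: error term 1 - y*r1
  double r2  = std::fma(r1, e1, r1);  // Fma3: second reciprocal refinement
  double q   = x * r2;                // Mul:  quotient estimate
  double res = std::fma(-y, q, x);    // Fma4: residual x - y*q
  return std::fma(res, r2, q);        // DIV_FMAS role: corrected quotient
}
```

On SI the condition output of the second div_scale is not usable (the hardware bug noted in the patch), which is why the lowering reconstructs the "which scale applies" flag by comparing the high dwords of the inputs against those of the div_scale results before feeding v_div_fmas.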