Diffstat (limited to 'lib/Target/R600/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/R600/SIISelLowering.cpp | 409 |
1 file changed, 254 insertions, 155 deletions
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 4c672ca..0a0fbd9 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -26,21 +26,22 @@ using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM),
-    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) {
+    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
+    TRI(TM.getRegisterInfo()) {
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
-  addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
-  addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);
+  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
 
-  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
-  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
+  addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
 
   computeRegisterProperties();
 
-  setOperationAction(ISD::AND, MVT::i1, Custom);
-
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
 
@@ -62,63 +63,13 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
-  const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
   MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
   MachineBasicBlock::iterator I = MI;
 
-  if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
-    AppendS_WAITCNT(MI, *BB, llvm::next(I));
-    return BB;
-  }
-
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
-  case AMDGPU::CLAMP_SI:
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
-           .addOperand(MI->getOperand(0))
-           .addOperand(MI->getOperand(1))
-           // VSRC1-2 are unused, but we still need to fill all the
-           // operand slots, so we just reuse the VSRC0 operand
-           .addOperand(MI->getOperand(1))
-           .addOperand(MI->getOperand(1))
-           .addImm(0) // ABS
-           .addImm(1) // CLAMP
-           .addImm(0) // OMOD
-           .addImm(0); // NEG
-    MI->eraseFromParent();
-    break;
-
-  case AMDGPU::FABS_SI:
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
-           .addOperand(MI->getOperand(0))
-           .addOperand(MI->getOperand(1))
-           // VSRC1-2 are unused, but we still need to fill all the
-           // operand slots, so we just reuse the VSRC0 operand
-           .addOperand(MI->getOperand(1))
-           .addOperand(MI->getOperand(1))
-           .addImm(1) // ABS
-           .addImm(0) // CLAMP
-           .addImm(0) // OMOD
-           .addImm(0); // NEG
-    MI->eraseFromParent();
-    break;
-
-  case AMDGPU::FNEG_SI:
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
-           .addOperand(MI->getOperand(0))
-           .addOperand(MI->getOperand(1))
-           // VSRC1-2 are unused, but we still need to fill all the
-           // operand slots, so we just reuse the VSRC0 operand
-           .addOperand(MI->getOperand(1))
-           .addOperand(MI->getOperand(1))
-           .addImm(0) // ABS
-           .addImm(0) // CLAMP
-           .addImm(0) // OMOD
-           .addImm(1); // NEG
-    MI->eraseFromParent();
-    break;
   case AMDGPU::SHADER_TYPE:
     BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
                                        MI->getOperand(0).getImm();
@@ -128,29 +79,13 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::SI_INTERP:
     LowerSI_INTERP(MI, *BB, I, MRI);
     break;
-  case AMDGPU::SI_INTERP_CONST:
-    LowerSI_INTERP_CONST(MI, *BB, I, MRI);
-    break;
-  case AMDGPU::SI_KIL:
-    LowerSI_KIL(MI, *BB, I, MRI);
-    break;
   case AMDGPU::SI_WQM:
     LowerSI_WQM(MI, *BB, I, MRI);
     break;
-  case AMDGPU::SI_V_CNDLT:
-    LowerSI_V_CNDLT(MI, *BB, I, MRI);
-    break;
   }
   return BB;
 }
 
-void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
-    MachineBasicBlock::iterator I) const {
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
-          .addImm(0);
-}
-
-
 void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
@@ -190,57 +125,6 @@ void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
   MI->eraseFromParent();
 }
 
-void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
-    MachineBasicBlock &BB, MachineBasicBlock::iterator I,
-    MachineRegisterInfo &MRI) const {
-  MachineOperand dst = MI->getOperand(0);
-  MachineOperand attr_chan = MI->getOperand(1);
-  MachineOperand attr = MI->getOperand(2);
-  MachineOperand params = MI->getOperand(3);
-  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
-          .addOperand(params);
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
-          .addOperand(dst)
-          .addOperand(attr_chan)
-          .addOperand(attr)
-          .addReg(M0);
-
-  MI->eraseFromParent();
-}
-
-void SITargetLowering::LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB,
-    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
-  // Clear this pixel from the exec mask if the operand is negative
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMPX_LE_F32_e32),
-          AMDGPU::VCC)
-          .addReg(AMDGPU::SREG_LIT_0)
-          .addOperand(MI->getOperand(0));
-
-  MI->eraseFromParent();
-}
-
-void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
-    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
-  unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
-  BuildMI(BB, I, BB.findDebugLoc(I),
-          TII->get(AMDGPU::V_CMP_GT_F32_e32),
-          VCC)
-          .addReg(AMDGPU::SREG_LIT_0)
-          .addOperand(MI->getOperand(1));
-
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32))
-          .addOperand(MI->getOperand(0))
-          .addOperand(MI->getOperand(3))
-          .addOperand(MI->getOperand(2))
-          .addReg(VCC);
-
-  MI->eraseFromParent();
-}
-
 EVT SITargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i1;
 }
@@ -255,7 +139,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID =
                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -272,30 +155,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-/// \brief The function is for lowering i1 operations on the
-/// VCC register.
-///
-/// In the VALU context, VCC is a one bit register, but in the
-/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only
-/// the SALU can perform operations on the VCC register, we need to promote
-/// the operand types from i1 to i64 in order for tablegen to be able to match
-/// this operation to the correct SALU instruction. We do this promotion by
-/// wrapping the operands in a CopyToReg node.
-///
-SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
-                                               SelectionDAG &DAG,
-                                               unsigned VCCNode) const {
-  DebugLoc DL = Op.getDebugLoc();
-
-  SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
-                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
-                                           Op.getOperand(0)),
-                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
-                                           Op.getOperand(1)));
-
-  return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
-}
-
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
@@ -500,12 +359,252 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-#define NODE_NAME_CASE(node) case SIISD::node: return #node;
+/// \brief Test if RegClass is one of the VSrc classes
+static bool isVSrc(unsigned RegClass) {
+  return AMDGPU::VSrc_32RegClassID == RegClass ||
+         AMDGPU::VSrc_64RegClassID == RegClass;
+}
+
+/// \brief Test if RegClass is one of the SSrc classes
+static bool isSSrc(unsigned RegClass) {
+  return AMDGPU::SSrc_32RegClassID == RegClass ||
+         AMDGPU::SSrc_64RegClassID == RegClass;
+}
+
+/// \brief Analyze the possible immediate value Op
+///
+/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
+/// and the immediate value if it's a literal immediate
+int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
+
+  union {
+    int32_t I;
+    float F;
+  } Imm;
+
+  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N))
+    Imm.I = Node->getSExtValue();
+  else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+    Imm.F = Node->getValueAPF().convertToFloat();
+  else
+    return -1; // It isn't an immediate
+
+  if ((Imm.I >= -16 && Imm.I <= 64) ||
+      Imm.F == 0.5f || Imm.F == -0.5f ||
+      Imm.F == 1.0f || Imm.F == -1.0f ||
+      Imm.F == 2.0f || Imm.F == -2.0f ||
+      Imm.F == 4.0f || Imm.F == -4.0f)
+    return 0; // It's an inline immediate
+
+  return Imm.I; // It's a literal immediate
+}
+
+/// \brief Try to fold an immediate directly into an instruction
+bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
+                               bool &ScalarSlotUsed) const {
+
+  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
+  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
+    return false;
+
+  const SDValue &Op = Mov->getOperand(0);
+  int32_t Value = analyzeImmediate(Op.getNode());
+  if (Value == -1) {
+    // Not an immediate at all
+    return false;
+
+  } else if (Value == 0) {
+    // Inline immediates can always be folded
+    Operand = Op;
+    return true;
+
+  } else if (Value == Immediate) {
+    // Already folded literal immediate
+    Operand = Op;
+    return true;
+
+  } else if (!ScalarSlotUsed && !Immediate) {
+    // Fold this literal immediate
+    ScalarSlotUsed = true;
+    Immediate = Value;
+    Operand = Op;
+    return true;
 
-const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-  default: return AMDGPUTargetLowering::getTargetNodeName(Opcode);
-  NODE_NAME_CASE(VCC_AND)
-  NODE_NAME_CASE(VCC_BITCAST)
   }
+
+  return false;
+}
+
+/// \brief Does "Op" fit into register class "RegClass"?
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op,
+                                    unsigned RegClass) const {
+
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  SDNode *Node = Op.getNode();
+
+  int OpClass;
+  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
+    const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
+    OpClass = Desc.OpInfo[Op.getResNo()].RegClass;
+
+  } else if (Node->getOpcode() == ISD::CopyFromReg) {
+    RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
+    OpClass = MRI.getRegClass(Reg->getReg())->getID();
+
+  } else
+    return false;
+
+  if (OpClass == -1)
+    return false;
+
+  return TRI->getRegClass(RegClass)->hasSubClassEq(TRI->getRegClass(OpClass));
+}
+
+/// \brief Make sure that we don't exceed the number of allowed scalars
+void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
+                                       unsigned RegClass,
+                                       bool &ScalarSlotUsed) const {
+
+  // First map the operand's register class to a destination class
+  if (RegClass == AMDGPU::VSrc_32RegClassID)
+    RegClass = AMDGPU::VReg_32RegClassID;
+  else if (RegClass == AMDGPU::VSrc_64RegClassID)
+    RegClass = AMDGPU::VReg_64RegClassID;
+  else
+    return;
+
+  // Nothing to do if they fit naturally
+  if (fitsRegClass(DAG, Operand, RegClass))
+    return;
+
+  // If the scalar slot isn't used yet use it now
+  if (!ScalarSlotUsed) {
+    ScalarSlotUsed = true;
+    return;
+  }
+
+  // This is a conservative approach; it is possible that we can't determine
+  // the correct register class and copy too often, but better safe than sorry.
+  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(),
+                                    Operand.getValueType(), Operand, RC);
+  Operand = SDValue(Node, 0);
+}
+
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+
+  // Original encoding (either e32 or e64)
+  int Opcode = Node->getMachineOpcode();
+  const MCInstrDesc *Desc = &TII->get(Opcode);
+
+  unsigned NumDefs = Desc->getNumDefs();
+  unsigned NumOps = Desc->getNumOperands();
+
+  // e64 version if available, -1 otherwise
+  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
+  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);
+
+  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
+  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
+
+  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
+  bool HaveVSrc = false, HaveSSrc = false;
+
+  // First figure out what we already have in this instruction
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (isVSrc(RegClass))
+      HaveVSrc = true;
+    else if (isSSrc(RegClass))
+      HaveSSrc = true;
+    else
+      continue;
+
+    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
+    if (Imm != -1 && Imm != 0) {
+      // Literal immediate
+      Immediate = Imm;
+    }
+  }
+
+  // If we have neither VSrc nor SSrc it makes no sense to continue
+  if (!HaveVSrc && !HaveSSrc)
+    return Node;
+
+  // No scalar allowed when we have both VSrc and SSrc
+  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
+
+  // Second, go over the operands and try to fold them
+  std::vector<SDValue> Ops;
+  bool Promote2e64 = false;
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    const SDValue &Operand = Node->getOperand(i);
+    Ops.push_back(Operand);
+
+    // Already folded immediate?
+    if (isa<ConstantSDNode>(Operand.getNode()) ||
+        isa<ConstantFPSDNode>(Operand.getNode()))
+      continue;
+
+    // Is this a VSrc or SSrc operand?
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (!isVSrc(RegClass) && !isSSrc(RegClass)) {
+
+      if (i == 1 && Desc->isCommutable() &&
+          fitsRegClass(DAG, Ops[0], RegClass) &&
+          foldImm(Ops[1], Immediate, ScalarSlotUsed)) {
+
+        assert(isVSrc(Desc->OpInfo[NumDefs].RegClass) ||
+               isSSrc(Desc->OpInfo[NumDefs].RegClass));
+
+        // Swap commutable operands
+        SDValue Tmp = Ops[1];
+        Ops[1] = Ops[0];
+        Ops[0] = Tmp;
+
+      } else if (DescE64 && !Immediate) {
+        // Test if it makes sense to switch to e64 encoding
+
+        RegClass = DescE64->OpInfo[Op].RegClass;
+        int32_t TmpImm = -1;
+        if ((isVSrc(RegClass) || isSSrc(RegClass)) &&
+            foldImm(Ops[i], TmpImm, ScalarSlotUsed)) {
+
+          Immediate = -1;
+          Promote2e64 = true;
+          Desc = DescE64;
+          DescE64 = 0;
+        }
+      }
+      continue;
+    }
+
+    // Try to fold the immediates
+    if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
+      // Folding didn't work, make sure we don't hit the SReg limit
+      ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
+    }
+  }
+
+  if (Promote2e64) {
+    // Add the modifier flags while promoting
+    for (unsigned i = 0; i < 4; ++i)
+      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
+  }
+
+  // Add optional chain and glue
+  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
+    Ops.push_back(Node->getOperand(i));
+
+  // Either create a completely new instruction or update the current one
+  if (Promote2e64)
+    return DAG.getMachineNode(OpcodeE64, Node->getDebugLoc(),
+                              Node->getVTList(), Ops.data(), Ops.size());
+  else
+    return DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
 }
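
Note (not part of the patch): the new analyzeImmediate() helper above separates SI inline immediates, which the hardware encodes inside the instruction word for free (integers in the range -16..64 and the float constants +-0.5, +-1.0, +-2.0 and +-4.0), from literal immediates, which consume the single extra 32-bit literal dword. A minimal standalone sketch of that classification, with the numeric thresholds copied from the hunk above; the classifyImmediate name and the ImmKind enum are invented here purely for illustration, and the union-based type punning mirrors the patch's own idiom:

#include <cassert>
#include <cstdint>

enum ImmKind { InlineImm, LiteralImm };

// Classify a 32-bit value the way analyzeImmediate() does: inline if it is a
// small integer or one of the blessed float constants, literal otherwise.
static ImmKind classifyImmediate(int32_t bits) {
  union { int32_t i; float f; } imm;
  imm.i = bits;

  if ((imm.i >= -16 && imm.i <= 64) ||
      imm.f == 0.5f || imm.f == -0.5f ||
      imm.f == 1.0f || imm.f == -1.0f ||
      imm.f == 2.0f || imm.f == -2.0f ||
      imm.f == 4.0f || imm.f == -4.0f)
    return InlineImm;

  return LiteralImm;
}

int main() {
  assert(classifyImmediate(64) == InlineImm);    // last value in the integer range
  assert(classifyImmediate(65) == LiteralImm);   // one past it, needs the literal slot
  union { float f; int32_t i; } two; two.f = 2.0f;
  assert(classifyImmediate(two.i) == InlineImm); // 2.0f is an inline float constant
  union { float f; int32_t i; } pi; pi.f = 3.14f;
  assert(classifyImmediate(pi.i) == LiteralImm); // arbitrary floats are literals
  return 0;
}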
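Note (not part of the patch): foldImm() and PostISelFolding() above share one piece of bookkeeping: an instruction gets at most one literal constant (its single scalar slot), so inline immediates are always folded, a literal equal to the one already claimed can be reused, and any further distinct literal has to stay in a register or force a switch to the e64 encoding. A simplified standalone model of that rule, with invented names (Operand, foldImmediates) and none of the SelectionDAG machinery:

#include <cassert>
#include <cstdint>
#include <vector>

struct Operand {
  bool isImm;     // is this operand a constant at all?
  bool isInline;  // constant the encoding accepts for free
  int32_t value;
  bool folded;    // set by foldImmediates()
};

// Walk the operands and fold constants under the "one literal per
// instruction" constraint modeled after foldImm().
static void foldImmediates(std::vector<Operand> &ops) {
  bool scalarSlotUsed = false;
  int32_t literal = 0;          // 0 means "no literal chosen yet"

  for (Operand &op : ops) {
    if (!op.isImm)
      continue;
    if (op.isInline) {
      op.folded = true;         // inline immediates are always free
    } else if (op.value == literal) {
      op.folded = true;         // reuse the literal already consumed
    } else if (!scalarSlotUsed && literal == 0) {
      scalarSlotUsed = true;    // claim the single literal slot
      literal = op.value;
      op.folded = true;
    }
    // otherwise: leave the value in a register (or, in the patch,
    // consider promoting the instruction to its e64 form)
  }
}

int main() {
  std::vector<Operand> ops = {
    {true, true, 4, false},      // inline immediate
    {true, false, 1000, false},  // first literal, takes the slot
    {true, false, 1000, false},  // same literal, can be reused
    {true, false, 2000, false},  // second distinct literal, cannot be folded
  };
  foldImmediates(ops);
  assert(ops[0].folded && ops[1].folded && ops[2].folded && !ops[3].folded);
  return 0;
}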