Diffstat (limited to 'lib/Target/R600')
43 files changed, 1561 insertions, 1035 deletions
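The largest functional change in this commit is how formal arguments are handled: AMDGPU.td now includes a new AMDGPUCallingConv.td, and AMDGPUTargetLowering gains an AnalyzeFormalArguments() helper that runs the incoming arguments through the generated CC_AMDGPU table instead of dropping them. For SI (HD7XXX) that table delegates to CC_SI, which puts "inreg" scalar arguments into SGPR0..SGPR15 (i64 into even/odd SGPR pairs) and everything else into VGPR0..VGPR31. The standalone C++ sketch below only models that assignment rule for illustration; the names (ArgInfo, assignSIArg) are made up and are not the TableGen-generated calling-convention code.

// Standalone model (not LLVM's generated CC_AMDGPU code) of the CC_SI rules
// added in AMDGPUCallingConv.td: "inreg" scalars go to SGPR0..SGPR15,
// everything else to VGPR0..VGPR31.
#include <cstdio>
#include <string>

struct ArgInfo {
  bool InReg;   // models the "inreg" attribute handled by CCIfInReg
  bool Is64Bit; // inreg i64 roughly consumes an even/odd SGPR pair
};

// Returns the register the sketch would assign, or "stack" when the pool is
// exhausted (only so the sketch stays total; the real table has no such case).
std::string assignSIArg(const ArgInfo &Arg, unsigned &NextSGPR,
                        unsigned &NextVGPR) {
  if (Arg.InReg) {
    if (Arg.Is64Bit) {
      NextSGPR += NextSGPR & 1;              // align to an even SGPR
      if (NextSGPR + 1 > 15) return "stack";
      unsigned Lo = NextSGPR;
      NextSGPR += 2;
      return "SGPR" + std::to_string(Lo) + "_SGPR" + std::to_string(Lo + 1);
    }
    if (NextSGPR > 15) return "stack";
    return "SGPR" + std::to_string(NextSGPR++);
  }
  if (NextVGPR > 31) return "stack";
  return "VGPR" + std::to_string(NextVGPR++);
}

int main() {
  unsigned SGPR = 0, VGPR = 0;
  ArgInfo Args[] = {{true, false}, {true, true}, {false, false}, {false, false}};
  for (const ArgInfo &A : Args)
    std::printf("%s\n", assignSIArg(A, SGPR, VGPR).c_str());
  return 0;
}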
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index ba87918..e099a9f 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,11 +23,9 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); -FunctionPass *createR600LowerConstCopy(TargetMachine &tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); -FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 40f4741..1a26c77 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -38,3 +38,4 @@ include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index c30dbe4..f600144 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -141,5 +141,5 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); OutStreamer.EmitIntValue(MaxSGPR + 1, 4); OutStreamer.EmitIntValue(MaxVGPR + 1, 4); - OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4); + OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); } diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td new file mode 100644 index 0000000..45ae37e --- /dev/null +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -0,0 +1,42 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. 
+// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15 + ]>>>, + + CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ] + >>>, + + CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + ]>>> + +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"# + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>> +]>; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 0a33264..5995b6f 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -14,7 +14,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPUISelLowering.h" +#include "AMDGPURegisterInfo.h" #include "AMDILIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -22,6 +25,8 @@ using namespace llvm; +#include "AMDGPUGenCallingConv.inc" + AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { @@ -64,17 +69,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // TargetLowering Callbacks //===---------------------------------------------------------------------===// -SDValue AMDGPUTargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - InVals.push_back(SDValue()); - } - return Chain; +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } SDValue AMDGPUTargetLowering::LowerReturn( diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 9e7d997..f31b646 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -39,15 +39,12 @@ protected: bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; + void AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + public: AMDGPUTargetLowering(TargetMachine &TM); - virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp 
b/lib/Target/R600/AMDGPUIndirectAddressing.cpp index 15840b3..ed6c8ec 100644 --- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp +++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp @@ -289,7 +289,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) { // We only need to use REG_SEQUENCE for explicit defs, since the // register coalescer won't do anything with the implicit defs. - MachineInstr *DefInstr = MRI.getVRegDef(Reg); if (!regHasExplicitDef(MRI, Reg)) { continue; } diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 960f108..e740348 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -132,13 +132,6 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst < [(set rc:$dst, (fneg rc:$src0))] >; -def SHADER_TYPE : AMDGPUShaderInst < - (outs), - (ins i32imm:$type), - "SHADER_TYPE $type", - [(int_AMDGPU_shader_type imm:$type)] ->; - } // usesCustomInserter = 1 multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, @@ -209,8 +202,8 @@ class Vector2_Build <ValueType vecType, RegisterClass vectorClass, (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1) >; -class Vector_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < +class Vector4_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), (elemType elemClass:$z), (elemType elemClass:$w))), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index 2ba2d4b..eecb25b 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -50,8 +50,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - - def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>; } let TargetPrefix = "TGSI", isTarget = 1 in { diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp index 26f842e..b723433 100644 --- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp @@ -243,6 +243,7 @@ public: initializeRegionInfoPass(*PassRegistry::getPassRegistry()); } + using Pass::doInitialization; virtual bool doInitialization(Region *R, RGPassManager &RGM); virtual bool runOnRegion(Region *R, RGPassManager &RGM); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index e2f00be..0185747 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" +#include "R600MachineScheduler.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "llvm/Analysis/Passes.h" @@ -39,6 +40,14 @@ extern "C" void LLVMInitializeR600Target() { RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); } +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMI(C, new R600SchedStrategy()); +} + +static MachineSchedRegistry +SchedCustomRegistry("r600", "Run R600's custom scheduler", + 
createR600MachineScheduler); + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -70,7 +79,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + enablePass(&MachineSchedulerID); + MachineSchedRegistry::setDefault(createR600MachineScheduler); + } + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -112,11 +127,6 @@ bool AMDGPUPassConfig::addInstSelector() { } bool AMDGPUPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - - if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { - addPass(createSIAssignInterpRegsPass(*TM)); - } addPass(createAMDGPUConvertToISAPass(*TM)); return false; } @@ -143,7 +153,6 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(&FinalizeMachineBundlesID); - addPass(createR600LowerConstCopy(*TM)); } else { addPass(createSILowerControlFlowPass(*TM)); } diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h index b39fbdb..39ab664 100644 --- a/lib/Target/R600/AMDIL.h +++ b/lib/Target/R600/AMDIL.h @@ -96,24 +96,23 @@ enum AddressSpaces { ADDRESS_NONE = 5, ///< Address space for unknown memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) - USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI - CONSTANT_BUFFER_0 = 9, - CONSTANT_BUFFER_1 = 10, - CONSTANT_BUFFER_2 = 11, - CONSTANT_BUFFER_3 = 12, - CONSTANT_BUFFER_4 = 13, - CONSTANT_BUFFER_5 = 14, - CONSTANT_BUFFER_6 = 15, - CONSTANT_BUFFER_7 = 16, - CONSTANT_BUFFER_8 = 17, - CONSTANT_BUFFER_9 = 18, - CONSTANT_BUFFER_10 = 19, - CONSTANT_BUFFER_11 = 20, - CONSTANT_BUFFER_12 = 21, - CONSTANT_BUFFER_13 = 22, - CONSTANT_BUFFER_14 = 23, - CONSTANT_BUFFER_15 = 24, - LAST_ADDRESS = 25 + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + LAST_ADDRESS = 24 }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index aa8ab6b..b0cd0f9 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -2595,6 +2595,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static int getBranchNzeroOpcode(int oldOpcode) { switch(oldOpcode) { + case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; @@ -2606,6 +2607,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static int getBranchZeroOpcode(int oldOpcode) { switch(oldOpcode) { + case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: 
return AMDGPU::IF_LOGICALZ_f32; @@ -2617,6 +2619,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static int getContinueNzeroOpcode(int oldOpcode) { switch(oldOpcode) { + case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; default: assert(0 && "internal error"); @@ -2626,6 +2629,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static int getContinueZeroOpcode(int oldOpcode) { switch(oldOpcode) { + case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; default: assert(0 && "internal error"); @@ -2654,8 +2658,7 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static bool isCondBranch(MachineInstr *instr) { switch (instr->getOpcode()) { - case AMDGPU::JUMP: - return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0; + case AMDGPU::JUMP_COND: case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: break; @@ -2668,7 +2671,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { static bool isUncondBranch(MachineInstr *instr) { switch (instr->getOpcode()) { case AMDGPU::JUMP: - return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0; case AMDGPU::BRANCH: return true; default: diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp index eec5059..db8e01e 100644 --- a/lib/Target/R600/AMDILDevice.cpp +++ b/lib/Target/R600/AMDILDevice.cpp @@ -115,10 +115,18 @@ bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const { std::string AMDGPUDevice::getDataLayout() const { - return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" - "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" - "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" - "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" - "-v512:512:512-v1024:1024:1024-v2048:2048:2048" - "-n8:16:32:64"); + std::string DataLayout = std::string( + "e" + "-p:32:32:32" + "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128" + "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048" + "-n32:64" + ); + + if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) { + DataLayout.append("-f64:64:64"); + } + + return DataLayout; } diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index e77b9dc..fa8f62d 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -162,6 +162,35 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + case ISD::BUILD_VECTOR: { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + break; + } + // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. 
+ SDValue RegSeqArgs[9] = { + CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32), + SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + bool IsRegSeq = true; + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (dyn_cast<RegisterSDNode>(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[2 * i + 1] = N->getOperand(i); + } + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs, 2 * N->getNumOperands() + 1); + } case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); @@ -336,17 +365,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, SDValue Operand = Ops[OperandIdx[i] - 1]; switch (Operand.getOpcode()) { case AMDGPUISD::CONST_ADDRESS: { - if (i == 2) - break; SDValue CstOffset; - if (!Operand.getValueType().isVector() && - SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { - Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); - Ops[SelIdx[i] - 1] = CstOffset; - return true; + if (Operand.getValueType().isVector() || + !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) + break; + + // Gather others constants values + std::vector<unsigned> Consts; + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = OperandIdx[j]; + if (SrcIdx < 0) + break; + if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]); + Consts.push_back(Cst->getZExtValue()); + } + } } + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) + break; + + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; } - break; case ISD::FNEG: if (NegIdx[i] < 0) break; diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index f65e1f3..922cac1 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -33,11 +33,6 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// -#include "AMDGPUGenCallingConv.inc" - -//===----------------------------------------------------------------------===// // TargetLowering Implementation Help Functions End //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp index 3096c22..0d1de3d 100644 --- a/lib/Target/R600/AMDILSIDevice.cpp +++ b/lib/Target/R600/AMDILSIDevice.cpp @@ -36,10 +36,13 @@ AMDGPUSIDevice::getGeneration() const { std::string AMDGPUSIDevice::getDataLayout() const { - return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16" - "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" - "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" - "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" - "-v512:512:512-v1024:1024:1024-v2048:2048:2048" - "-n8:16:32:64"); + return std::string( + "e" + "-p:64:64:64" + 
"-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128" + "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" + "-v2048:2048:2048" + "-n32:64" + ); } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 00f8b10..63c59e1 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -37,11 +37,10 @@ add_llvm_target(R600CodeGen R600ExpandSpecialInstrs.cpp R600InstrInfo.cpp R600ISelLowering.cpp - R600LowerConstCopy.cpp R600MachineFunctionInfo.cpp + R600MachineScheduler.cpp R600RegisterInfo.cpp SIAnnotateControlFlow.cpp - SIAssignInterpRegs.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h index 8721f80..cd3a7ce 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -33,15 +33,6 @@ public: SmallVectorImpl<MCFixup> &Fixups) const { return 0; } - - virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const { - return 0; - } - virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const { - return 0; - } }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index 6cc0077..e27abcc 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -42,9 +42,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCSubtargetInfo &STI; MCContext &Ctx; - /// \brief Encode a sequence of registers with the correct alignment. - unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; - /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -65,14 +62,6 @@ public: /// \returns the encoding for an MCOperand. 
virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups) const; - - /// \brief Encoding for when 2 consecutive registers are used - virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixup) const; - - /// \brief Encoding for when 4 consectuive registers are used - virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixup) const; }; } // End anonymous namespace @@ -212,24 +201,3 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return 0; } -//===----------------------------------------------------------------------===// -// Custom Operand Encodings -//===----------------------------------------------------------------------===// - -unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo, - unsigned shift) const { - unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg()); - return (regCode & 0xff) >> shift; -} - -unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI, - unsigned OpNo , - SmallVectorImpl<MCFixup> &Fixup) const { - return GPRAlign(MI, OpNo, 1); -} - -unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI, - unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixup) const { - return GPRAlign(MI, OpNo, 2); -} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index b5c2a93..a73691d 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -50,8 +50,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::UREM, MVT::v4i32, Expand); setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Custom); - setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::FSUB, MVT::f32, Expand); @@ -65,8 +65,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); @@ -94,6 +94,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::SELECT_CC); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::VLIW); } @@ -105,7 +106,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::SHADER_TYPE: break; case AMDGPU::CLAMP_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, @@ -150,7 +150,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); break; - + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, + MI->getOperand(1).getImm()); + break; + } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { @@ 
-215,8 +221,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::BRANCH: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)) - .addReg(0); + .addOperand(MI->getOperand(0)); break; case AMDGPU::BRANCH_COND_f32: { @@ -227,7 +232,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addImm(OPCODE_IS_NOT_ZERO) .addImm(0); // Flags TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) .addOperand(MI->getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; @@ -241,7 +246,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addImm(OPCODE_IS_NOT_ZERO_INT) .addImm(0); // Flags TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) .addOperand(MI->getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; @@ -306,11 +311,9 @@ using namespace llvm::AMDGPUIntrinsic; SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::ROTL: return LowerROTL(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); @@ -470,44 +473,6 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { ); } -SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue CC = Op.getOperand(1); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue JumpT = Op.getOperand(4); - SDValue CmpValue; - SDValue Result; - - if (LHS.getValueType() == MVT::i32) { - CmpValue = DAG.getNode( - ISD::SELECT_CC, - Op.getDebugLoc(), - MVT::i32, - LHS, RHS, - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - CC); - } else if (LHS.getValueType() == MVT::f32) { - CmpValue = DAG.getNode( - ISD::SELECT_CC, - Op.getDebugLoc(), - MVT::f32, - LHS, RHS, - DAG.getConstantFP(1.0f, MVT::f32), - DAG.getConstantFP(0.0f, MVT::f32), - CC); - } else { - assert(0 && "Not valid type for br_cc"); - } - Result = DAG.getNode( - AMDGPUISD::BRANCH_COND, - CmpValue.getDebugLoc(), - MVT::Other, Chain, - JumpT, CmpValue); - return Result; -} - SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, DebugLoc DL, unsigned DwordOffset) const { @@ -576,12 +541,37 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const // Check if we can lower this to a native operation. + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_any + // select_cc f32, f32, 1.0f, 0.0f, cc_any + // select_cc i32, i32, -1, 0, cc_any + // + + // Move hardware True/False values to the correct operand. 
+ if (isHWTrueValue(False) && isHWFalseValue(True)) { + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + std::swap(False, True); + CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32)); + } + + if (isHWTrueValue(True) && isHWFalseValue(False) && + (CompareVT == VT || VT == MVT::i32)) { + // This can be matched by a SET* instruction. + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); + } + // Try to lower to a CND* instruction: - // CND* instructions requires RHS to be zero. Some SELECT_CC nodes that - // can be lowered to CND* instructions can also be lowered to SET* - // instructions. CND* instructions are cheaper, because they dont't - // require additional instructions to convert their result to the correct - // value type, so this check should be first. + // + // CND* can match the following patterns: + // + // select_cc f32, 0.0, f32, f32, cc_any + // select_cc f32, 0.0, i32, i32, cc_any + // select_cc i32, 0, f32, f32, cc_any + // select_cc i32, 0, i32, i32, cc_any + // if (isZero(LHS) || isZero(RHS)) { SDValue Cond = (isZero(LHS) ? RHS : LHS); SDValue Zero = (isZero(LHS) ? LHS : RHS); @@ -623,38 +613,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); } - // Try to lower to a SET* instruction: - // - // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware, - // but for the other case where CompareVT != VT, all operands of - // SELECT_CC need to have the same value type, so we need to change True and - // False to be the same type as LHS and RHS, and then convert the result of - // the select_cc back to the correct type. - - // Move hardware True/False values to the correct operand. - if (isHWTrueValue(False) && isHWFalseValue(True)) { - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - std::swap(False, True); - CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32)); - } - - if (isHWTrueValue(True) && isHWFalseValue(False)) { - if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) { - SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - LHS, RHS, - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - CC); - // Convert integer values of true (-1) and false (0) to fp values of - // true (1.0f) and false (0.0f). - SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, - DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); - } else { - // This SELECT_CC is already legal. 
- return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); - } - } // Possible Min/Max pattern SDValue MinMax = LowerMinMax(Op, DAG); @@ -698,48 +656,6 @@ SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getCondCode(ISD::SETNE)); } -SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - SDValue Cond; - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue CC = Op.getOperand(2); - DebugLoc DL = Op.getDebugLoc(); - assert(Op.getValueType() == MVT::i32); - if (LHS.getValueType() == MVT::i32) { - Cond = DAG.getNode( - ISD::SELECT_CC, - Op.getDebugLoc(), - MVT::i32, - LHS, RHS, - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - CC); - } else if (LHS.getValueType() == MVT::f32) { - Cond = DAG.getNode( - ISD::SELECT_CC, - Op.getDebugLoc(), - MVT::f32, - LHS, RHS, - DAG.getConstantFP(1.0f, MVT::f32), - DAG.getConstantFP(0.0f, MVT::f32), - CC); - Cond = DAG.getNode( - ISD::FP_TO_SINT, - DL, - MVT::i32, - Cond); - } else { - assert(0 && "Not valid type for set_cc"); - } - Cond = DAG.getNode( - ISD::AND, - DL, - MVT::i32, - DAG.getConstant(1, MVT::i32), - Cond); - return Cond; -} - /// LLVM generates byte-addresed pointers. For indirect addressing, we need to /// convert these pointers to a register index. Each register holds /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the @@ -918,7 +834,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (ConstantBlock > -1) { SDValue Result; if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) || - dyn_cast<Constant>(LoadNode->getSrcValue())) { + dyn_cast<Constant>(LoadNode->getSrcValue()) || + dyn_cast<ConstantSDNode>(Ptr)) { SDValue Slots[4]; for (unsigned i = 0; i < 4; i++) { // We want Const position encoded with the following formula : @@ -934,7 +851,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } else { // non constant ptr cant be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)) + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32) ); } @@ -1122,6 +1041,9 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc SDValue LHS = N->getOperand(0); if (LHS.getOpcode() != ISD::SELECT_CC) { return SDValue(); @@ -1130,24 +1052,30 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SDValue RHS = N->getOperand(1); SDValue True = N->getOperand(2); SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); if (LHS.getOperand(2).getNode() != True.getNode() || LHS.getOperand(3).getNode() != False.getNode() || - RHS.getNode() != False.getNode() || - cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) { + RHS.getNode() != False.getNode()) { return SDValue(); } - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get(); - CCOpcode = ISD::getSetCCInverse( - CCOpcode, LHS.getOperand(0).getValueType().isInteger()); - return DAG.getSelectCC(N->getDebugLoc(), - LHS.getOperand(0), - LHS.getOperand(1), - LHS.getOperand(2), - LHS.getOperand(3), - CCOpcode); + switch 
(NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + return DAG.getSelectCC(N->getDebugLoc(), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); } + } + } case AMDGPUISD::EXPORT: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index afa3897..5cb4b91 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -52,14 +52,11 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - /// \brief Lower ROTL opcode to BITALIGN SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 7e3f005..0865098 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::OP3)); } +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) + const { + assert (Consts.size() <= 12 && "Too many operands in instructions group"); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned ReadConstHalf = Consts[i] & 2; + unsigned ReadConstIndex = Consts[i] & (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const { + std::vector<unsigned> Consts; + for (unsigned i = 0, n = MIs.size(); i < n; i++) { + const MachineInstr *MI = MIs[i]; + + const R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, + }; + + if (!isALUInstr(MI->getOpcode())) + continue; + + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { + unsigned Const = MI->getOperand( + getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Consts.push_back(Const); + } + } + } + return fitsConstReadLimitations(Consts); +} + DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, const ScheduleDAG *DAG) const { const InstrItineraryData *II = TM->getInstrItineraryData(); @@ -168,6 +222,11 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, return NULL; } +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + bool R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, 
MachineBasicBlock *&TBB, @@ -186,7 +245,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) { + if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) { return false; } @@ -196,22 +255,20 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); if (I == MBB.begin() || - static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) { + !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { if (LastOpc == AMDGPU::JUMP) { - if(!isPredicated(LastInst)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - MachineInstr *predSet = I; - while (!isPredicateSetter(predSet->getOpcode())) { - predSet = --I; - } - TBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(predSet->getOperand(1)); - Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); - return false; + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; } return true; // Can't handle indirect branch. } @@ -221,10 +278,7 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP && - isPredicated(SecondLastInst) && - LastOpc == AMDGPU::JUMP && - !isPredicated(LastInst)) { + if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { MachineInstr *predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -261,7 +315,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, if (FBB == 0) { if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); return 1; } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); @@ -269,7 +323,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, addFlag(PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)) + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) .addMBB(TBB) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); return 1; @@ -279,10 +333,10 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, assert(PredSet && "No previous predicate !"); addFlag(PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)) + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) .addMBB(TBB) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); return 2; } } @@ -302,11 +356,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { switch (I->getOpcode()) { default: return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + break; + } case AMDGPU::JUMP: - if (isPredicated(I)) { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - } 
I->eraseFromParent(); break; } @@ -320,11 +376,13 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { // FIXME: only one case?? default: return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + break; + } case AMDGPU::JUMP: - if (isPredicated(I)) { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - } I->eraseFromParent(); break; } @@ -356,6 +414,8 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const { if (MI->getOpcode() == AMDGPU::KILLGT) { return false; + } else if (isVector(*MI)) { + return false; } else { return AMDGPUInstrInfo::isPredicable(MI); } diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index efe721c..bf9569e 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -53,6 +53,9 @@ namespace llvm { /// \returns true if this \p Opcode represents an ALU instruction. bool isALUInstr(unsigned Opcode) const; + bool fitsConstReadLimitations(const std::vector<unsigned>&) const; + bool canBundle(const std::vector<MachineInstr *> &) const; + /// \breif Vector instructions are instructions that must fill all /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 8242df9..8c50d54 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -512,8 +512,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst < []>; def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", - SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, - [SDNPMayLoad] + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] >; //===----------------------------------------------------------------------===// @@ -1090,12 +1090,12 @@ class COS_Common <bits<11> inst> : R600_1OP < multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), - (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) >; def : Pat< (fdiv R600_Reg32:$src0, R600_Reg32:$src1), - (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) >; } @@ -1169,12 +1169,12 @@ let Predicates = [isR600] in { // cards. 
class COS_PAT <InstR600 trig> : Pat< (fcos R600_Reg32:$src), - (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) >; class SIN_PAT <InstR600 trig> : Pat< (fsin R600_Reg32:$src), - (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) >; //===----------------------------------------------------------------------===// @@ -1587,19 +1587,28 @@ def PRED_X : InstR600 < (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), "", [], NullALU> { let FlagOperandIdx = 3; - let isTerminator = 1; } -let isTerminator = 1, isBranch = 1, isBarrier = 1 in { - -def JUMP : InstR600 <0x10, +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 <0x10, (outs), - (ins brtarget:$target, R600_Pred:$p), + (ins brtarget:$target, R600_Predicate_Bit:$p), "JUMP $target ($p)", [], AnyALU >; -} // End isTerminator = 1, isBranch = 1, isBarrier = 1 +def JUMP : InstR600 <0x10, + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 let usesCustomInserter = 1 in { @@ -1639,7 +1648,7 @@ def FNEG_R600 : FNEG<R600_Reg32>; //===---------------------------------------------------------------------===// // Return instruction //===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, usesCustomInserter = 1 in { def RETURN : ILFormat<(outs), (ins variable_ops), "RETURN", [(IL_retflag)]>; @@ -1650,27 +1659,27 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, // Constant Buffer Addressing Support //===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { def CONST_COPY : Instruction { let OutOperandList = (outs R600_Reg32:$dst); let InOperandList = (ins i32imm:$src); - let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; let AsmString = "CONST_COPY"; let neverHasSideEffects = 1; let isAsCheapAsAMove = 1; let Itinerary = NullALU; } -} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" def TEX_VTX_CONSTBUF : - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr", - [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>, + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; let FETCH_TYPE = 2; let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = 0; let SRC_REL = 0; let SRC_SEL_X = 0; let DST_REL = 0; @@ -1840,6 +1849,18 @@ let isTerminator=1 in { // ISel Patterns //===----------------------------------------------------------------------===// +// CND*_INT Pattterns for f32 True / False values + +class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < + (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1), + R600_Reg32:$src2, cc), + (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) +>; + +def : CND_INT_f32 
<CNDE_INT, SETEQ>; +def : CND_INT_f32 <CNDGT_INT, SETGT>; +def : CND_INT_f32 <CNDGE_INT, SETGE>; + //CNDGE_INT extra pattern def : Pat < (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1), @@ -1958,8 +1979,8 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>; def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>; def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>; -def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>; -def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>; +def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>; +def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>; // bitconvert patterns diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp deleted file mode 100644 index 3ebe653..0000000 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ /dev/null @@ -1,222 +0,0 @@ -//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr. -/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot -/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits -/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try -/// to fold them if possible or replace them by MOV otherwise. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/IR/GlobalValue.h" - -namespace llvm { - -class R600LowerConstCopy : public MachineFunctionPass { -private: - static char ID; - const R600InstrInfo *TII; - - struct ConstPairs { - unsigned XYPair; - unsigned ZWPair; - }; - - bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const; -public: - R600LowerConstCopy(TargetMachine &tm); - virtual bool runOnMachineFunction(MachineFunction &MF); - - const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; } -}; - -char R600LowerConstCopy::ID = 0; - -R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : - MachineFunctionPass(ID), - TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) -{ -} - -bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst, - unsigned ReadConst) const { - unsigned ReadConstChan = ReadConst & 3; - unsigned ReadConstIndex = ReadConst & (~3); - if (ReadConstChan < 2) { - if (!UsedConst.XYPair) { - UsedConst.XYPair = ReadConstIndex; - } - return UsedConst.XYPair == ReadConstIndex; - } else { - if (!UsedConst.ZWPair) { - UsedConst.ZWPair = ReadConstIndex; - } - return UsedConst.ZWPair == ReadConstIndex; - } -} - -static bool isControlFlow(const MachineInstr &MI) { - return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) || - (MI.getOpcode() == AMDGPU::ENDIF) || - (MI.getOpcode() == AMDGPU::ELSE) || - (MI.getOpcode() == AMDGPU::WHILELOOP) || - (MI.getOpcode() == AMDGPU::BREAK); -} - -bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - DenseMap<unsigned, 
MachineInstr *> RegToConstIndex; - for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(), - E = MBB.instr_end(); I != E;) { - - if (I->getOpcode() == AMDGPU::CONST_COPY) { - MachineInstr &MI = *I; - I = llvm::next(I); - unsigned DstReg = MI.getOperand(0).getReg(); - DenseMap<unsigned, MachineInstr *>::iterator SrcMI = - RegToConstIndex.find(DstReg); - if (SrcMI != RegToConstIndex.end()) { - SrcMI->second->eraseFromParent(); - RegToConstIndex.erase(SrcMI); - } - MachineInstr *NewMI = - TII->buildDefaultInstruction(MBB, &MI, AMDGPU::MOV, - MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, - MI.getOperand(1).getImm()); - RegToConstIndex[DstReg] = NewMI; - MI.eraseFromParent(); - continue; - } - - std::vector<unsigned> Defs; - // We consider all Instructions as bundled because algorithm that handle - // const read port limitations inside an IG is still valid with single - // instructions. - std::vector<MachineInstr *> Bundle; - - if (I->isBundle()) { - unsigned BundleSize = I->getBundleSize(); - for (unsigned i = 0; i < BundleSize; i++) { - I = llvm::next(I); - Bundle.push_back(I); - } - } else if (TII->isALUInstr(I->getOpcode())){ - Bundle.push_back(I); - } else if (isControlFlow(*I)) { - RegToConstIndex.clear(); - I = llvm::next(I); - continue; - } else { - MachineInstr &MI = *I; - for (MachineInstr::mop_iterator MOp = MI.operands_begin(), - MOpE = MI.operands_end(); MOp != MOpE; ++MOp) { - MachineOperand &MO = *MOp; - if (!MO.isReg()) - continue; - if (MO.isDef()) { - Defs.push_back(MO.getReg()); - } else { - // Either a TEX or an Export inst, prevent from erasing def of used - // operand - RegToConstIndex.erase(MO.getReg()); - for (MCSubRegIterator SR(MO.getReg(), &TII->getRegisterInfo()); - SR.isValid(); ++SR) { - RegToConstIndex.erase(*SR); - } - } - } - } - - - R600Operands::Ops OpTable[3][2] = { - {R600Operands::SRC0, R600Operands::SRC0_SEL}, - {R600Operands::SRC1, R600Operands::SRC1_SEL}, - {R600Operands::SRC2, R600Operands::SRC2_SEL}, - }; - - for(std::vector<MachineInstr *>::iterator It = Bundle.begin(), - ItE = Bundle.end(); It != ItE; ++It) { - MachineInstr *MI = *It; - if (TII->isPredicated(MI)) { - // We don't want to erase previous assignment - RegToConstIndex.erase(MI->getOperand(0).getReg()); - } else { - int WriteIDX = TII->getOperandIdx(MI->getOpcode(), R600Operands::WRITE); - if (WriteIDX < 0 || MI->getOperand(WriteIDX).getImm()) - Defs.push_back(MI->getOperand(0).getReg()); - } - } - - ConstPairs CP = {0,0}; - for (unsigned SrcOp = 0; SrcOp < 3; SrcOp++) { - for(std::vector<MachineInstr *>::iterator It = Bundle.begin(), - ItE = Bundle.end(); It != ItE; ++It) { - MachineInstr *MI = *It; - int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[SrcOp][0]); - if (SrcIdx < 0) - continue; - MachineOperand &MO = MI->getOperand(SrcIdx); - DenseMap<unsigned, MachineInstr *>::iterator SrcMI = - RegToConstIndex.find(MO.getReg()); - if (SrcMI != RegToConstIndex.end()) { - MachineInstr *CstMov = SrcMI->second; - int ConstMovSel = - TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL); - unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm(); - if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) { - TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); - MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); - } else { - RegToConstIndex.erase(SrcMI); - } - } - } - } - - for (std::vector<unsigned>::iterator It = Defs.begin(), ItE = Defs.end(); - It != ItE; ++It) { - DenseMap<unsigned, MachineInstr 
*>::iterator SrcMI = - RegToConstIndex.find(*It); - if (SrcMI != RegToConstIndex.end()) { - SrcMI->second->eraseFromParent(); - RegToConstIndex.erase(SrcMI); - } - } - I = llvm::next(I); - } - - if (MBB.succ_empty()) { - for (DenseMap<unsigned, MachineInstr *>::iterator - DI = RegToConstIndex.begin(), DE = RegToConstIndex.end(); - DI != DE; ++DI) { - DI->second->eraseFromParent(); - } - } - } - return false; -} - -FunctionPass *createR600LowerConstCopy(TargetMachine &tm) { - return new R600LowerConstCopy(tm); -} - -} - - diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp index 40aec83..b07a585 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.cpp +++ b/lib/Target/R600/R600MachineFunctionInfo.cpp @@ -14,5 +14,4 @@ using namespace llvm; R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) : MachineFunctionInfo() { - memset(Outputs, 0, sizeof(Outputs)); } diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index 4b901f4..13a46b8 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -26,7 +26,6 @@ public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector<unsigned, 4> LiveOuts; std::vector<unsigned> IndirectRegs; - SDNode *Outputs[16]; }; } // End llvm namespace diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp new file mode 100644 index 0000000..9074364 --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -0,0 +1,427 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "misched" + +#include "R600MachineScheduler.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include <set> + +using namespace llvm; + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + + DAG = dag; + TII = static_cast<const R600InstrInfo*>(DAG->TII); + TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); + MRI = &DAG->MRI; + Available[IDAlu]->clear(); + Available[IDFetch]->clear(); + Available[IDOther]->clear(); + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 15; + InstKindLimit[IDAlu] = 120; // 120 minus 8 for security + + + const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) { + InstKindLimit[IDFetch] = 7; // 8 minus 1 for security + } else { + InstKindLimit[IDFetch] = 15; // 16 minus 1 for security + } +} + +void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst) +{ + if (QSrc->empty()) + return; + for (ReadyQueue::iterator I = QSrc->begin(), + E = QSrc->end(); I != E; ++I) { + (*I)->NodeQueueId &= ~QSrc->getID(); + QDst->push(*I); + } + QSrc->clear(); +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = 0; + IsTopNode = true; + NextInstKind = IDOther; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurInstKind == IDOther) || + (CurEmitted > InstKindLimit[CurInstKind]) || + (Available[CurInstKind]->empty()); + bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) && + (!Available[IDFetch]->empty() || !Available[IDOther]->empty()); + + if ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu)) { + // try to pick ALU + SU = pickAlu(); + if (SU) { + if (CurEmitted > InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << "picked node: "; + SU->dump(DAG); + } else { + dbgs() << "NO NODE "; + for (int i = 0; i < IDLast; ++i) { + Available[i]->dump(); + Pending[i]->dump(); + } + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + + DEBUG(dbgs() << "scheduled: "); + DEBUG(SU->dump(DAG)); + + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask = 15; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } 
+ } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } + MoveUnits(Pending[IDOther], Available[IDOther]); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + int IK = getInstKind(SU); + + DEBUG(dbgs() << IK << " <= "); + DEBUG(SU->dump(DAG)); + + Pending[IK]->push(SU); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + switch (MI->getOpcode()) { + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + return AluT_XYZW; + case AMDGPU::COPY: + if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) { + // %vregX = COPY Tn_X is likely to be discarded in favor of an + // assignement of Tn_X to %vregX, don't considers it in scheduling + return AluDiscarded; + } + else if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return AluT_XYZW; + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? 
+ unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT4_eg_pseudo: + case AMDGPU::DOT4_r600_pseudo: + return IDAlu; + case AMDGPU::TEX_VTX_CONSTBUF: + case AMDGPU::TEX_VTX_TEXBUF: + case AMDGPU::TEX_LD: + case AMDGPU::TEX_GET_TEXTURE_RESINFO: + case AMDGPU::TEX_GET_GRADIENTS_H: + case AMDGPU::TEX_GET_GRADIENTS_V: + case AMDGPU::TEX_SET_GRADIENTS_H: + case AMDGPU::TEX_SET_GRADIENTS_V: + case AMDGPU::TEX_SAMPLE: + case AMDGPU::TEX_SAMPLE_C: + case AMDGPU::TEX_SAMPLE_L: + case AMDGPU::TEX_SAMPLE_C_L: + case AMDGPU::TEX_SAMPLE_LB: + case AMDGPU::TEX_SAMPLE_C_LB: + case AMDGPU::TEX_SAMPLE_G: + case AMDGPU::TEX_SAMPLE_C_G: + case AMDGPU::TXD: + case AMDGPU::TXD_SHADOW: + return IDFetch; + default: + DEBUG( + dbgs() << "other inst: "; + SU->dump(DAG); + ); + return IDOther; + } +} + +SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) { + if (Q.empty()) + return NULL; + for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end(); + It != E; ++It) { + SUnit *SU = *It; + InstructionsGroupCandidate.push_back(SU->getInstr()); + if (TII->canBundle(InstructionsGroupCandidate)) { + InstructionsGroupCandidate.pop_back(); + Q.erase(It); + return SU; + } else { + InstructionsGroupCandidate.pop_back(); + } + } + return NULL; +} + +void R600SchedStrategy::LoadAlu() { + ReadyQueue *QSrc = Pending[IDAlu]; + for (ReadyQueue::iterator I = QSrc->begin(), + E = QSrc->end(); I != E; ++I) { + (*I)->NodeQueueId &= ~QSrc->getID(); + AluKind AK = getAluKind(*I); + AvailableAlus[AK].insert(*I); + } + QSrc->clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert (OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; + InstructionsGroupCandidate.clear(); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + unsigned DestReg = MI->getOperand(0).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == MI->getOperand(0).getReg()) + return; + } + // Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) { + 
static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]); + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]); + if (!UnslotedSU) { + return SlotedSU; + } else if (!SlotedSU) { + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; + } else { + //Determine which one to pick (the lesser one) + if (CompareSUnit()(SlotedSU, UnslotedSU)) { + AvailableAlus[AluAny].insert(UnslotedSU); + return SlotedSU; + } else { + AvailableAlus[IndexToID[Slot]].insert(SlotedSU); + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; + } + } +} + +bool R600SchedStrategy::isAvailablesAluEmpty() const { + return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() && + AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() && + AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() && + AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (!isAvailablesAluEmpty()) { + if (!OccupedSlotsMask) { + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask = 15; + return PopInst(AvailableAlus[AluDiscarded]); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask = 15; + return PopInst(AvailableAlus[AluT_XYZW]); + } + } + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate.push_back(SU->getInstr()); + return SU; + } + } + } + PrepareNextSlot(); + } + return NULL; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = 0; + ReadyQueue *AQ = Available[QID]; + + if (AQ->empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ->empty()) { + SU = *AQ->begin(); + AQ->remove(AQ->begin()); + } + return SU; +} + diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h new file mode 100644 index 0000000..3d0367f --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.h @@ -0,0 +1,120 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef R600MACHINESCHEDULER_H_ +#define R600MACHINESCHEDULER_H_ + +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/PriorityQueue.h" + +using namespace llvm; + +namespace llvm { + +class CompareSUnit { +public: + bool operator()(const SUnit *S1, const SUnit *S2) { + return S1->getDepth() > S2->getDepth(); + } +}; + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMI *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstQueue { + QAlu = 1, + QFetch = 2, + QOther = 4 + }; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + ReadyQueue *Available[IDLast], *Pending[IDLast]; + std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast]; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(0), TII(0), TRI(0), MRI(0) { + Available[IDAlu] = new ReadyQueue(QAlu, "AAlu"); + Available[IDFetch] = new ReadyQueue(QFetch, "AFetch"); + Available[IDOther] = new ReadyQueue(QOther, "AOther"); + Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu"); + Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch"); + Pending[IDOther] = new ReadyQueue(QOther<<4, "POther"); + } + + virtual ~R600SchedStrategy() { + for (unsigned I = 0; I < IDLast; ++I) { + delete Available[I]; + delete Pending[I]; + } + } + + virtual void initialize(ScheduleDAGMI *dag); + virtual SUnit *pickNode(bool &IsTopNode); + virtual void schedNode(SUnit *SU, bool IsTopNode); + virtual void releaseTopNode(SUnit *SU); + virtual void releaseBottomNode(SUnit *SU); + +private: + std::vector<MachineInstr *> InstructionsGroupCandidate; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + bool isAvailablesAluEmpty() const; + SUnit *AttemptFillSlot (unsigned Slot); + void PrepareNextSlot(); + SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/R600/SIAssignInterpRegs.cpp b/lib/Target/R600/SIAssignInterpRegs.cpp deleted file mode 100644 index 832e44d..0000000 --- a/lib/Target/R600/SIAssignInterpRegs.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass maps the pseudo interpolation registers to the correct physical -/// registers. -// -/// Prior to executing a fragment shader, the GPU loads interpolation -/// parameters into physical registers. 
The specific physical register that each -/// interpolation parameter ends up in depends on the type of the interpolation -/// parameter as well as how many interpolation parameters are used by the -/// shader. -// -//===----------------------------------------------------------------------===// - - - -#include "AMDGPU.h" -#include "AMDIL.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -class SIAssignInterpRegsPass : public MachineFunctionPass { - -private: - static char ID; - TargetMachine &TM; - - void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI, - unsigned physReg, unsigned virtReg); - -public: - SIAssignInterpRegsPass(TargetMachine &tm) : - MachineFunctionPass(ID), TM(tm) { } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - const char *getPassName() const { return "SI Assign intrpolation registers"; } -}; - -} // End anonymous namespace - -char SIAssignInterpRegsPass::ID = 0; - -#define INTERP_VALUES 16 -#define REQUIRED_VALUE_MAX_INDEX 7 - -struct InterpInfo { - bool Enabled; - unsigned Regs[3]; - unsigned RegCount; -}; - - -FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) { - return new SIAssignInterpRegsPass(tm); -} - -bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) { - - struct InterpInfo InterpUse[INTERP_VALUES] = { - {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2}, - {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2}, - {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2}, - {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3}, - {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2}, - {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2}, - {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2}, - {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1}, - {false, {AMDGPU::POS_X_FLOAT}, 1}, - {false, {AMDGPU::POS_Y_FLOAT}, 1}, - {false, {AMDGPU::POS_Z_FLOAT}, 1}, - {false, {AMDGPU::POS_W_FLOAT}, 1}, - {false, {AMDGPU::FRONT_FACE}, 1}, - {false, {AMDGPU::ANCILLARY}, 1}, - {false, {AMDGPU::SAMPLE_COVERAGE}, 1}, - {false, {AMDGPU::POS_FIXED_PT}, 1} - }; - - SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); - // This pass is only needed for pixel shaders. - if (MFI->ShaderType != ShaderType::PIXEL) { - return false; - } - MachineRegisterInfo &MRI = MF.getRegInfo(); - bool ForceEnable = true; - - // First pass, mark the interpolation values that are used. - for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) { - for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount; - RegIdx++) { - InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled || - !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]); - if (InterpUse[InterpIdx].Enabled && - InterpIdx <= REQUIRED_VALUE_MAX_INDEX) { - ForceEnable = false; - } - } - } - - // At least one interpolation mode must be enabled or else the GPU will hang. - if (ForceEnable) { - InterpUse[0].Enabled = true; - } - - unsigned UsedVgprs = 0; - - // Second pass, replace with VGPRs. 
- for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) { - if (!InterpUse[InterpIdx].Enabled) { - continue; - } - MFI->SPIPSInputAddr |= (1 << InterpIdx); - - for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount; - RegIdx++, UsedVgprs++) { - unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs); - unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg); - addLiveIn(&MF, MRI, NewReg, VirtReg); - } - } - - return false; -} - -void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF, - MachineRegisterInfo & MRI, - unsigned physReg, unsigned virtReg) { - const TargetInstrInfo * TII = TM.getInstrInfo(); - if (!MRI.isLiveIn(physReg)) { - MRI.addLiveIn(physReg, virtReg); - MF->front().addLiveIn(physReg); - BuildMI(MF->front(), MF->front().begin(), DebugLoc(), - TII->get(TargetOpcode::COPY), virtReg) - .addReg(physReg); - } else { - MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg)); - } -} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0a0fbd9..93f8c38 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,10 +14,13 @@ #include "SIISelLowering.h" #include "AMDIL.h" +#include "AMDGPU.h" #include "AMDILIntrinsicInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -28,30 +31,41 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM), TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())), TRI(TM.getRegisterInfo()) { - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); computeRegisterProperties(); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - - // We need to custom lower loads from the USER_SGPR address space, so we can - // add the SGPRs as livein registers. 
- setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -59,6 +73,137 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); + + setSchedulingPreference(Sched::Source); +} + +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + assert(CallConv == CallingConv::C); + + SmallVector<ISD::InputArg, 16> Splits; + uint32_t Skipped = 0; + + for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + + // First check if it's a PS input addr + if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { + + assert((PSInputNum <= 15) && "Too many PS inputs!"); + + if (!Arg.Used) { + // We can savely skip PS inputs + Skipped |= 1 << i; + ++PSInputNum; + continue; + } + + Info->PSInputAddr |= 1 << PSInputNum++; + } + + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eigth. + Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + + } else { + Splits.push_back(Arg); + } + } + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // At least one interpolation mode must be enabled or else the GPU will hang. 
+ if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { + Info->PSInputAddr |= 1; + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + } + + AnalyzeFormalArguments(CCInfo, Splits); + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + + if (Skipped & (1 << i)) { + InVals.push_back(SDValue()); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + MVT VT = VA.getLocVT(); + + if (VT == MVT::i64) { + // For now assume it is a pointer + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, + &AMDGPU::SReg_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + continue; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + const ISD::InputArg &Arg = Ins[i]; + if (Arg.VT.isVector()) { + + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.OrigArgIndex); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector<SDValue, 4> Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + for (unsigned j = 0; j != NumElements; ++j) + Regs.push_back(DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, + Regs.data(), Regs.size())); + continue; + } + + InVals.push_back(Val); + } + return Chain; } MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( @@ -70,15 +215,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SHADER_TYPE: - BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType = - MI->getOperand(0).getImm(); - MI->eraseFromParent(); - break; - - case AMDGPU::SI_INTERP: - LowerSI_INTERP(MI, *BB, I, MRI); - break; case AMDGPU::SI_WQM: LowerSI_WQM(MI, *BB, I, MRI); break; @@ -94,41 +230,14 @@ void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, MI->eraseFromParent(); } -void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { - unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); - MachineOperand dst = MI->getOperand(0); - MachineOperand iReg = MI->getOperand(1); - MachineOperand jReg = MI->getOperand(2); - MachineOperand attr_chan = MI->getOperand(3); - MachineOperand attr = MI->getOperand(4); - MachineOperand params = MI->getOperand(5); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) - .addOperand(params); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp) - .addOperand(iReg) - .addOperand(attr_chan) - .addOperand(attr) - .addReg(M0); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32)) - .addOperand(dst) - .addReg(tmp) - .addOperand(jReg) - .addOperand(attr_chan) - .addOperand(attr) - .addReg(M0); - - MI->eraseFromParent(); -} - EVT SITargetLowering::getSetCCResultType(EVT VT) const { return MVT::i1; } 
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { + return MVT::i32; +} + //===----------------------------------------------------------------------===// // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// @@ -137,20 +246,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_vs_load_buffer_index: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR0, VT); - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - } - break; - } } return SDValue(); } @@ -249,47 +345,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } -SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op); - - assert(Ptr); - - unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace(); - - // We only need to lower USER_SGPR address space loads - if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) { - return SDValue(); - } - - // Loads from the USER_SGPR address space can only have constant value - // pointers. - ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr()); - assert(BasePtr); - - unsigned TypeDwordWidth = VT.getSizeInBits() / 32; - const TargetRegisterClass * dstClass; - switch (TypeDwordWidth) { - default: - assert(!"USER_SGPR value size not implemented"); - return SDValue(); - case 1: - dstClass = &AMDGPU::SReg_32RegClass; - break; - case 2: - dstClass = &AMDGPU::SReg_64RegClass; - break; - } - uint64_t Index = BasePtr->getZExtValue(); - assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned"); - unsigned SGPRIndex = Index / TypeDwordWidth; - unsigned Reg = dstClass->getRegister(SGPRIndex); - - DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg, - VT)); - return SDValue(); -} - SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index 737162f..d656225 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -24,14 +24,9 @@ class SITargetLowering : public AMDGPUTargetLowering { const SIInstrInfo * TII; const TargetRegisterInfo * TRI; - void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, unsigned Opocde) const; - void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -43,9 +38,17 @@ class SITargetLowering : public AMDGPUTargetLowering { public: SITargetLowering(TargetMachine &tm); + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const 
SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const; virtual EVT getSetCCResultType(EVT VT) const; + virtual MVT getScalarShiftAmountTy(EVT VT) const; virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 24fc929..98bd3db 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -88,6 +88,9 @@ private: MachineBasicBlock::iterator I, const Counters &Counts); + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + /// \brief Resolve all operand dependencies to counter requirements Counters handleOperands(MachineInstr &MI); @@ -125,7 +128,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // Only consider stores or EXP for EXP_CNT Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore())); + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { @@ -311,8 +314,10 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { RegInterval Interval = getRegInterval(Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { - if (Op.isDef()) + if (Op.isDef()) { increaseCounters(Result, UsedRegs[j]); + increaseCounters(Result, DefinedRegs[j]); + } if (Op.isUse()) increaseCounters(Result, DefinedRegs[j]); diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index fe417d6..3891ddb 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -129,12 +129,12 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm, list<dag> pattern> : Enc32<outs, ins, asm, pattern> { bits<7> SDST; - bits<6> SBASE; + bits<7> SBASE; bits<8> OFFSET; let Inst{7-0} = OFFSET; let Inst{8} = imm; - let Inst{14-9} = SBASE; + let Inst{14-9} = SBASE{6-1}; let Inst{21-15} = SDST; let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding @@ -292,7 +292,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : bits<1> ADDR64; bits<1> LDS; bits<8> VADDR; - bits<5> SRSRC; + bits<7> SRSRC; bits<1> SLC; bits<1> TFE; bits<8> SOFFSET; @@ -307,7 +307,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{31-26} = 0x38; //encoding let Inst{39-32} = VADDR; let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; + let Inst{52-48} = SRSRC{6-2}; let Inst{54} = SLC; let Inst{55} = TFE; let Inst{63-56} = SOFFSET; @@ -330,7 +330,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : bits<4> DFMT; bits<3> NFMT; bits<8> VADDR; - bits<5> SRSRC; + bits<7> SRSRC; bits<1> SLC; bits<1> TFE; bits<8> SOFFSET; @@ -346,7 +346,7 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = VADDR; let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; + let Inst{52-48} = SRSRC{6-2}; let Inst{54} = SLC; let Inst{55} = TFE; let Inst{63-56} = SOFFSET; @@ -370,8 +370,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : bits<1> LWE; bits<1> SLC; bits<8> VADDR; - bits<5> SRSRC; - bits<5> 
SSAMP; + bits<7> SRSRC; + bits<7> SSAMP; let Inst{11-8} = DMASK; let Inst{12} = UNORM; @@ -385,8 +385,8 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let Inst{31-26} = 0x3c; let Inst{39-32} = VADDR; let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; - let Inst{57-53} = SSAMP; + let Inst{52-48} = SRSRC{6-2}; + let Inst{57-53} = SSAMP{6-2}; let VM_CNT = 1; let EXP_CNT = 1; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index d9dbd6a..de2373b 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -33,36 +33,108 @@ const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const { void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // If we are trying to copy to or from SCC, there is a bug somewhere else in // the backend. While it may be theoretically possible to do this, it should // never be necessary. assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0)) - .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc)) - .addReg(DestReg, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1)) - .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc)); + const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + }; + + const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + }; + + const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + }; + + const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, 0 + }; + + unsigned Opcode; + const int16_t *SubIndices; + + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_3; + + } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_7; + + } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_15; + } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) { assert(AMDGPU::VReg_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); + AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 
.addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_1; + + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || + AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_3; + + } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || + AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_7; + + } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || + AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_15; + } else { - assert(AMDGPU::SReg_32RegClass.contains(DestReg)); - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + llvm_unreachable("Can't copy register!"); + } + + while (unsigned SubIdx = *SubIndices++) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, SubIdx)); + + Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + + if (*SubIndices) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index d6c3f06..2f10c38 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -53,16 +53,6 @@ def SIOperand { int VCC = 0x6A; } -class GPR4Align <RegisterClass rc> : Operand <vAny> { - let EncoderMethod = "GPR4AlignEncode"; - let MIOperandInfo = (ops rc:$reg); -} - -class GPR2Align <RegisterClass rc> : Operand <iPTR> { - let EncoderMethod = "GPR2AlignEncode"; - let MIOperandInfo = (ops rc:$reg); -} - include "SIInstrFormats.td" //===----------------------------------------------------------------------===// @@ -125,16 +115,17 @@ class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < opName#" $dst, $src0", pattern >; -multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> { +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, + RegisterClass dstClass> { def _IMM : SMRD < op, 1, (outs dstClass:$dst), - (ins GPR2Align<SReg_64>:$sbase, i32imm:$offset), + (ins baseClass:$sbase, i32imm:$offset), asm#" $dst, $sbase, $offset", [] >; def _SGPR : SMRD < op, 0, (outs dstClass:$dst), - (ins GPR2Align<SReg_64>:$sbase, SReg_32:$soff), + (ins baseClass:$sbase, SReg_32:$soff), asm#" $dst, $sbase, $soff", [] >; } @@ -276,7 +267,7 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU (outs), (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, - GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { @@ -288,7 +279,7 @@ class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF op, (outs regClass:$dst), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc, + 
i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, " #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset", @@ -301,7 +292,7 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF op, (outs regClass:$dst), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, + i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", @@ -315,7 +306,7 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < (outs VReg_128:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, - GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp), + SReg_256:$srsrc, SReg_128:$ssamp), asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", []> { diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index af116f0..05b04a9 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -403,9 +403,9 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT //def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>; //def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>; //def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>; -//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>; -//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>; -//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>; +def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>; +def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>; +def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; //def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; @@ -458,17 +458,31 @@ def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORM let mayLoad = 1 in { -defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; +defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; -//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; -//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; -//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; -//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; -//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>; -//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, 
"S_BUFFER_LOAD_DWORDX8", []>; -//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 +>; } // mayLoad = 1 @@ -840,7 +854,9 @@ defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>; -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", + [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))] +>; defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; let isCommutable = 1 in { @@ -1044,13 +1060,6 @@ def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; let isCodeGenOnly = 1, isPseudo = 1 in { -def SET_M0 : InstSI < - (outs SReg_32:$dst), - (ins i32imm:$src0), - "SET_M0 $dst, $src0", - [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))] ->; - def LOAD_CONST : AMDGPUShaderInst < (outs GPRF32:$dst), (ins i32imm:$src), @@ -1060,13 +1069,6 @@ def LOAD_CONST : AMDGPUShaderInst < let usesCustomInserter = 1 in { -def SI_INTERP : InstSI < - (outs VReg_32:$dst), - (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), - "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params", - [] ->; - def SI_WQM : InstSI < (outs), (ins), @@ -1147,6 +1149,31 @@ def SI_KILL : InstSI < } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 // Uses = [EXEC], Defs = [EXEC] +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { + +def SI_INDIRECT_SRC : InstSI < + (outs VReg_32:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off", + [] +>; + +class SI_INDIRECT_DST<RegisterClass rc> : InstSI < + (outs rc:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val), + "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val", + [] +> { + let Constraints = "$src = $dst"; +} + +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; + +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] + } // end IsCodeGenOnly, isPseudo def : Pat< @@ -1255,22 +1282,83 @@ defm : SamplePatterns<VReg_128, v4i32>; defm : SamplePatterns<VReg_256, v8i32>; defm : SamplePatterns<VReg_512, v16i32>; -def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>; -def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>; -def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>; -def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>; +/********** ============================================ **********/ +/********** Extraction, Insertion, Building and Casting **********/ +/********** ============================================ **********/ + +foreach Index = 0-2 in { + def Extract_Element_v2i32_#Index : Extract_Element < + i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2i32_#Index : 
Insert_Element < + i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v2f32_#Index : Extract_Element < + f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v2f32_#Index : Insert_Element < + f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-3 in { + def Extract_Element_v4i32_#Index : Extract_Element < + i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4i32_#Index : Insert_Element < + i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + >; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>; + def Extract_Element_v4f32_#Index : Extract_Element < + f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v4f32_#Index : Insert_Element < + f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-7 in { + def Extract_Element_v8i32_#Index : Extract_Element < + i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8i32_#Index : Insert_Element < + i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v8f32_#Index : Extract_Element < + f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v8f32_#Index : Insert_Element < + f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-15 in { + def Extract_Element_v16i32_#Index : Extract_Element < + i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16i32_#Index : Insert_Element < + i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v16f32_#Index : Extract_Element < + f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v16f32_#Index : Insert_Element < + f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + >; +} def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>; def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>; -def : Vector_Build <v4f32, VReg_128, f32, VReg_32>; -def : Vector_Build <v4i32, VReg_128, i32, VReg_32>; +def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>; +def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>; +def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>; def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>; +def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>; def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>; +def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>; def : BitConvert <i32, f32, SReg_32>; def : BitConvert <i32, f32, VReg_32>; @@ -1305,11 +1393,6 @@ def : Pat < /********** ================== **********/ def : Pat < - (i1 imm:$imm), - (S_MOV_B64 imm:$imm) ->; - -def : Pat < (i32 imm:$imm), (V_MOV_B32_e32 imm:$imm) >; @@ -1320,13 +1403,8 @@ def : Pat < >; def : Pat < - (i32 imm:$imm), - (S_MOV_B32 imm:$imm) ->; - -def : Pat < - (f32 fpimm:$imm), - (S_MOV_B32 fpimm:$imm) + (i1 imm:$imm), + (S_MOV_B64 imm:$imm) >; def : Pat < @@ -1347,58 +1425,16 @@ def : Pat < /********** ===================== **********/ def : Pat < - (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params), - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, - (S_MOV_B32 SReg_32:$params)) ->; - -def : Pat < - 
(int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), - (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, - imm:$attr, SReg_32:$params) + (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params), + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params) >; def : Pat < - (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), - (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan, - imm:$attr, SReg_32:$params) ->; - -def : Pat < - (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params), - (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan, - imm:$attr, SReg_32:$params) ->; - -def : Pat < - (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), - (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan, - imm:$attr, SReg_32:$params) ->; - -def : Pat < - (int_SI_fs_read_face), - (f32 FRONT_FACE) ->; - -def : Pat < - (int_SI_fs_read_pos 0), - (f32 POS_X_FLOAT) ->; - -def : Pat < - (int_SI_fs_read_pos 1), - (f32 POS_Y_FLOAT) ->; - -def : Pat < - (int_SI_fs_read_pos 2), - (f32 POS_Z_FLOAT) ->; - -def : Pat < - (int_SI_fs_read_pos 3), - (f32 POS_W_FLOAT) + (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij), + (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0), + imm:$attr_chan, imm:$attr, M0Reg:$params), + (EXTRACT_SUBREG VReg_64:$ij, sub1), + imm:$attr_chan, imm:$attr, M0Reg:$params) >; /********** ================== **********/ @@ -1455,6 +1491,24 @@ def : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) >; +// 1. Offset as 8bit DWORD immediate +def : Pat < + (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset) +>; + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (int_SI_load_const SReg_128:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset)) +>; + +// 3. Offset in an 32Bit VGPR +def : Pat < + (int_SI_load_const SReg_128:$sbase, VReg_32:$voff), + (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0) +>; + /********** ================== **********/ /********** VOP3 Patterns **********/ /********** ================== **********/ @@ -1489,7 +1543,51 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; + +/********** ====================== **********/ +/********** Indirect adressing **********/ +/********** ====================== **********/ + +multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt, + SI_INDIRECT_DST IndDst> { + // 1. Extract with offset + def : Pat< + (vector_extract (vt rc:$vec), + (i64 (zext (i32 (add VReg_32:$idx, imm:$off)))) + ), + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off)) + >; + + // 2. Extract without offset + def : Pat< + (vector_extract (vt rc:$vec), + (i64 (zext (i32 VReg_32:$idx))) + ), + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0)) + >; + + // 3. 
Insert with offset + def : Pat< + (vector_insert (vt rc:$vec), (f32 VReg_32:$val), + (i64 (zext (i32 (add VReg_32:$idx, imm:$off)))) + ), + (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val)) + >; + + // 4. Insert without offset + def : Pat< + (vector_insert (vt rc:$vec), (f32 VReg_32:$val), + (i64 (zext (i32 VReg_32:$idx))) + ), + (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val)) + >; +} + +defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>; +defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>; +defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>; +defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>; } // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 611b9c4..33bb815 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -16,13 +16,11 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - /* XXX: We may need a seperate intrinsic here for loading integer values */ - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>; - def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ; def int_SI_wqm : Intrinsic <[], [], []>; - class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>; + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>; def int_SI_sample : Sample; def int_SI_sampleb : Sample; @@ -30,17 +28,8 @@ let TargetPrefix = "SI", isTarget = 1 in { /* Interpolation Intrinsics */ - def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>; - class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; - - def int_SI_fs_interp_linear_center : Interp; - def int_SI_fs_interp_linear_centroid : Interp; - def int_SI_fs_interp_persp_center : Interp; - def int_SI_fs_interp_persp_centroid : Interp; - def int_SI_fs_interp_constant : Interp; - - def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>; - def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; + def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrReadMem]>; /* Control flow Intrinsics */ diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index b215aa2..9a027e7 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -66,6 +66,7 @@ private: static const unsigned SkipThreshold = 12; static char ID; + const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); @@ -84,9 +85,14 @@ 
private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + void LoadM0(MachineInstr &MI, MachineInstr *MovRel); + void IndirectSrc(MachineInstr &MI); + void IndirectDst(MachineInstr &MI); + public: SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } + MachineFunctionPass(ID), TRI(tm.getRegisterInfo()), + TII(tm.getInstrInfo()) { } virtual bool runOnMachineFunction(MachineFunction &MF); @@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + MBB.insert(I, MovRel); + MI.eraseFromParent(); + return; + } + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VReg_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC) + .addReg(Idx); + + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC); + + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + unsigned Off = MI.getOperand(4).getImm(); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define) + .addReg(Val) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel); +} + bool 
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
   bool HaveKill = false;
@@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::S_BRANCH:
           Branch(MI);
           break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
       }
     }
   }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 7e59b42..1a4e4cb 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,11 +10,25 @@
 #include "SIMachineFunctionInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
 using namespace llvm;
+const char *SIMachineFunctionInfo::ShaderTypeAttribute = "ShaderType";
+
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : MachineFunctionInfo(),
-    SPIPSInputAddr(0),
-    ShaderType(0)
-  { }
+    ShaderType(0),
+    PSInputAddr(0) {
+
+  AttributeSet Set = MF.getFunction()->getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
+                                 ShaderTypeAttribute);
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType))
+      llvm_unreachable("Can't parse shader type!");
+  }
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 47271f5..91a809b 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -23,9 +23,11 @@ namespace llvm {
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public MachineFunctionInfo {
 public:
+  static const char *ShaderTypeAttribute;
+
   SIMachineFunctionInfo(const MachineFunction &MF);
-  unsigned SPIPSInputAddr;
   unsigned ShaderType;
+  unsigned PSInputAddr;
 };
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 9e04e24..4f14931 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -34,32 +34,6 @@ foreach Index = 0-255 in {
   }
 }
-// virtual Interpolation registers
-def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
-def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
-def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
-def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
-def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
-def PERSP_CENTROID_J : SIReg <"PERP_CENTROID_J">;
-def PERSP_I_W : SIReg <"PERSP_I_W">;
-def PERSP_J_W : SIReg <"PERSP_J_W">;
-def PERSP_1_W : SIReg <"PERSP_1_W">;
-def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
-def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
-def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
-def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
-def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
-def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
-def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
-def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
-def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
-def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
-def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
-def FRONT_FACE : SIReg <"FRONT_FACE">;
-def ANCILLARY : SIReg <"ANCILLARY">;
-def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
-def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
-
 //===----------------------------------------------------------------------===//
 // Groupings using register classes and tuples
 //===----------------------------------------------------------------------===//
@@ -177,22 +151,22 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
   (add SGPR_64, VCCReg, EXECReg)
 >;
-def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
 // Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>;
+def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
-def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
-def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 //===----------------------------------------------------------------------===//
 // [SV]Src_* register classes, can have either an immediate or an register
@@ -200,28 +174,9 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
 def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>;
-
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add VReg_32, SReg_32,
-   PERSP_SAMPLE_I, PERSP_SAMPLE_J,
-   PERSP_CENTER_I, PERSP_CENTER_J,
-   PERSP_CENTROID_I, PERSP_CENTROID_J,
-   PERSP_I_W, PERSP_J_W, PERSP_1_W,
-   LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
-   LINEAR_CENTER_I, LINEAR_CENTER_J,
-   LINEAR_CENTROID_I, LINEAR_CENTROID_J,
-   LINE_STIPPLE_TEX_COORD,
-   POS_X_FLOAT,
-   POS_Y_FLOAT,
-   POS_Z_FLOAT,
-   POS_W_FLOAT,
-   FRONT_FACE,
-   ANCILLARY,
-   SAMPLE_COVERAGE,
-   POS_FIXED_PT
-  )
->;
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
-def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>;
+def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
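
Note: for readers following the SILowerControlFlow.cpp changes above, the new LoadM0 helper builds a per-wave "waterfall" loop when the index register is a VGPR (i.e. may differ per lane). A rough sketch of the emitted sequence follows; register numbers are illustrative only, not what the pass actually uses, and the exact operand syntax is approximate:

    s_mov_b64            s[2:3], exec      ; save the EXEC mask
  loop:
    v_readfirstlane_b32  vcc_lo, v1        ; read the index of one still-active lane
    s_mov_b32            m0, vcc_lo        ; move that index into M0
    v_cmp_eq_u32         vcc, m0, v1       ; find every lane wanting the same index
    s_and_saveexec_b64   vcc, vcc          ; restrict EXEC to those lanes
    v_movrels_b32        v0, v2            ; the indirect move (v_movreld_b32 for the dst case)
    s_xor_b64            exec, exec, vcc   ; turn off the lanes just handled
    s_cbranch_execnz     loop              ; lanes remaining -> next iteration
    s_mov_b64            exec, s[2:3]      ; restore EXEC

When the index is already in an SGPR, LoadM0 skips the loop entirely and emits a single s_mov_b32 into M0 before the movrel instruction.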