Diffstat (limited to 'lib/Target/R600/AMDGPUInstrInfo.cpp')
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.cpp | 56
1 file changed, 29 insertions, 27 deletions
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index fef5b8c..a8fc614 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -86,21 +86,6 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 // TODO: Implement this function
   return nullptr;
 }
-bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
-                                         MachineBasicBlock &MBB) const {
-  while (iter != MBB.end()) {
-    switch (iter->getOpcode()) {
-    default:
-      break;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32:
-    case AMDGPU::BRANCH:
-      return true;
-    };
-    ++iter;
-  }
-  return false;
-}
 
 void
 AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -147,7 +132,6 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const
   } else if (isRegisterStore(*MI)) {
     int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                               AMDGPU::OpName::val);
-    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
     unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
     unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
     unsigned Address = calculateIndirectAddress(RegIndex, Channel);
@@ -215,15 +199,30 @@ AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
   return 0;
 }
 
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
-                                              int64_t Offset1, int64_t Offset2,
-                                              unsigned NumLoads) const {
-  assert(Offset2 > Offset1
-         && "Second offset should be larger than first offset!");
-  // If we have less than 16 loads in a row, and the offsets are within 16,
-  // then schedule together.
-  // TODO: Make the loads schedule near if it fits in a cacheline
-  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+bool AMDGPUInstrInfo::enableClusterLoads() const {
+  return true;
+}
+
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into 2 16 store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                              int64_t Offset0, int64_t Offset1,
+                                              unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 64
+  // bytes, then schedule together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
 }
 
 bool
@@ -320,7 +319,10 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
     return -1;
   }
 
-  Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+  Offset = MF.getTarget()
+               .getSubtargetImpl()
+               ->getFrameLowering()
+               ->getFrameIndexOffset(MF, -1);
 
   return getIndirectIndexBegin(MF) + Offset;
 }
@@ -335,7 +337,7 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
 }
 
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
+// header files, so we need to wrap it in a function that takes unsigned
 // instead.
 namespace llvm {
 namespace AMDGPU {
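
Note: the core of the change above is the new clustering heuristic in shouldScheduleLoadsNear: a pair of loads is clustered only while the run holds at most 16 loads and the two offsets fall within one 64-byte global-memory cacheline. The standalone sketch below is for illustration only and is not part of the patch; the helper name shouldClusterLoads and its integer-only signature are assumptions, whereas the real hook takes SDNode pointers and is queried by the SelectionDAG scheduler while clustering loads.

    // Minimal sketch of the clustering predicate, assuming a simplified
    // signature (shouldClusterLoads is a hypothetical helper, not LLVM API).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Cluster two loads when the run is at most 16 loads long and both
    // offsets fall within one 64-byte (global memory) cacheline.
    static bool shouldClusterLoads(int64_t Offset0, int64_t Offset1,
                                   unsigned NumLoads) {
      assert(Offset1 > Offset0 &&
             "Second offset should be larger than first offset!");
      return NumLoads <= 16 && (Offset1 - Offset0) < 64;
    }

    int main() {
      std::printf("%d\n", shouldClusterLoads(0, 16, 2));   // 1: same cacheline
      std::printf("%d\n", shouldClusterLoads(0, 128, 2));  // 0: more than 64 bytes apart
      std::printf("%d\n", shouldClusterLoads(0, 16, 32));  // 0: too many loads in the run
      return 0;
    }

Compared with the old heuristic, the distance bound grows from 16 to 64 bytes (one cacheline) and the load-count check relaxes from NumLoads < 16 to NumLoads <= 16.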