aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/R600/SILowerControlFlow.cpp
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2012-12-11 21:25:42 +0000
committerTom Stellard <thomas.stellard@amd.com>2012-12-11 21:25:42 +0000
commitf98f2ce29e6e2996fa58f38979143eceaa818335 (patch)
tree86dffe7414e6657874db8ac36e5ddcf7d41b2d9c /lib/Target/R600/SILowerControlFlow.cpp
parent57ac1f458a754f30cf500410b438fb260f9b8fe5 (diff)
downloadexternal_llvm-f98f2ce29e6e2996fa58f38979143eceaa818335.zip
external_llvm-f98f2ce29e6e2996fa58f38979143eceaa818335.tar.gz
external_llvm-f98f2ce29e6e2996fa58f38979143eceaa818335.tar.bz2
Add R600 backend
A new backend supporting AMD GPUs: Radeon HD2XXX - HD7XXX git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169915 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600/SILowerControlFlow.cpp')
-rw-r--r--lib/Target/R600/SILowerControlFlow.cpp191
1 files changed, 191 insertions, 0 deletions
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
new file mode 100644
index 0000000..277b647
--- /dev/null
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -0,0 +1,191 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF)
+/// to predicated instructions.
+///
+/// All control flow (except loops) is handled using predicated instructions and
+/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
+/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writting to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU). Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
+/// Vector ALU) and then the ScalarALU will AND the VCC register with the
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// SI_IF_NZ %VCC
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// ELSE
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// ENDIF
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0 // This instruction is an
+/// // optimization which allows us to
+/// // branch if all the bits of
+/// // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
+/// S_BRANCH_EXECZ label1 // Use our branch optimization
+/// // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR2 // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+ static char ID;
+ const TargetInstrInfo *TII;
+ std::vector<unsigned> PredicateStack;
+ std::vector<unsigned> UnusedRegisters;
+
+ unsigned allocReg();
+ void freeReg(unsigned Reg);
+
+public:
+ SILowerControlFlowPass(TargetMachine &tm) :
+ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ const char *getPassName() const {
+ return "SI Lower control flow instructions";
+ }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+ return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+
+ // Find all the unused registers that can be used for the predicate stack.
+ for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
+ S = AMDGPU::SReg_64RegClass.end();
+ I != S; ++I) {
+ unsigned Reg = *I;
+ if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
+ UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
+ }
+ }
+
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+ BB != BB_E; ++BB) {
+ MachineBasicBlock &MBB = *BB;
+ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+ I != MBB.end(); I = Next) {
+ Next = llvm::next(I);
+ MachineInstr &MI = *I;
+ unsigned Reg;
+ switch (MI.getOpcode()) {
+ default: break;
+ case AMDGPU::SI_IF_NZ:
+ Reg = allocReg();
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ Reg)
+ .addOperand(MI.getOperand(0)); // VCC
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
+ Reg)
+ .addReg(Reg)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ PredicateStack.push_back(Reg);
+ break;
+
+ case AMDGPU::ELSE:
+ Reg = PredicateStack.back();
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+ Reg)
+ .addReg(Reg);
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
+ AMDGPU::EXEC)
+ .addReg(Reg)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ break;
+
+ case AMDGPU::ENDIF:
+ Reg = PredicateStack.back();
+ PredicateStack.pop_back();
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
+ AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(Reg);
+ freeReg(Reg);
+
+ if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
+ PredicateStack.empty()) {
+ // If the exec mask is non-zero, skip the next two instructions
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addImm(3)
+ .addReg(AMDGPU::EXEC);
+
+ // Exec mask is zero: Export to NULL target...
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
+ .addImm(0)
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addImm(0)
+ .addImm(1)
+ .addImm(1)
+ .addReg(AMDGPU::SREG_LIT_0)
+ .addReg(AMDGPU::SREG_LIT_0)
+ .addReg(AMDGPU::SREG_LIT_0)
+ .addReg(AMDGPU::SREG_LIT_0);
+
+ // ... and terminate wavefront
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
+ }
+ MI.eraseFromParent();
+ break;
+ }
+ }
+ }
+ return true;
+}
+
+unsigned SILowerControlFlowPass::allocReg() {
+
+ assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
+ unsigned Reg = UnusedRegisters.back();
+ UnusedRegisters.pop_back();
+ return Reg;
+}
+
+void SILowerControlFlowPass::freeReg(unsigned Reg) {
+
+ UnusedRegisters.push_back(Reg);
+}