aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/R600/SILowerControlFlow.cpp
blob: 277b647f676b9fc153e52f6d419a5394d9a20820 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF)
/// to predicated instructions.
///
/// All control flow (except loops) is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
/// Vector ALU) and then the ScalarALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// SI_IF_NZ %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// ELSE
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// ENDIF
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace {

/// \brief Machine pass that rewrites the SI_IF_NZ / ELSE / ENDIF pseudo
/// instructions into real exec-mask manipulation (see file header comment).
class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static char ID;
  const TargetInstrInfo *TII;
  // Stack of SReg_64 physical registers holding the saved exec-mask bits for
  // each currently-open IF nest: pushed when lowering SI_IF_NZ, popped at
  // ENDIF.  NOTE(review): member state persists across functions — see
  // runOnMachineFunction.
  std::vector<unsigned> PredicateStack;
  // Pool of SReg_64 physical registers unused by the current function,
  // available to hold saved exec masks.  Treated as a stack by
  // allocReg()/freeReg().
  std::vector<unsigned> UnusedRegisters;

  // Pop a free register from UnusedRegisters; asserts if the pool is empty.
  unsigned allocReg();
  // Return Reg to the UnusedRegisters pool for reuse.
  void freeReg(unsigned Reg);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }

  virtual bool runOnMachineFunction(MachineFunction &MF);

  const char *getPassName() const {
    return "SI Lower control flow instructions";
  }

};

} // End anonymous namespace

// Pass identification: the *address* of ID, not its value, uniquely
// identifies the pass to the LLVM pass manager.
char SILowerControlFlowPass::ID = 0;

/// \brief Factory entry point used by the target to schedule this pass.
FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  SILowerControlFlowPass *Pass = new SILowerControlFlowPass(tm);
  return Pass;
}

/// \brief Lower SI_IF_NZ / ELSE / ENDIF pseudos in \p MF to exec-mask updates.
///
/// Scans every instruction, replacing each control-flow pseudo with the
/// S_*SAVEEXEC / S_XOR / S_OR sequences described in the file header.  Saved
/// exec masks live in SReg_64 registers that the function does not otherwise
/// use, managed as a stack (PredicateStack).
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {

  // runOnMachineFunction is invoked once per function, but this pass object
  // (and therefore these member vectors) persists across functions.  Reset
  // them so stale free-register entries from a previous function — which may
  // be *used* registers in this one — are never handed out, and so a
  // leftover predicate stack cannot corrupt this function's lowering.
  PredicateStack.clear();
  UnusedRegisters.clear();

  // Find all the unused registers that can be used for the predicate stack.
  for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
                                     S = AMDGPU::SReg_64RegClass.end();
                                     I != S; ++I) {
    unsigned Reg = *I;
    if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
      // Insert at the front: allocReg() pops from the back, so registers are
      // handed out in register-class iteration order.
      UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
    }
  }

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    // Capture the successor iterator before processing: the current
    // instruction may be erased below, which would invalidate I.
    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
                               I != MBB.end(); I = Next) {
      Next = llvm::next(I);
      MachineInstr &MI = *I;
      unsigned Reg;
      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF_NZ:
          // Reg = EXEC; EXEC &= VCC — enable only lanes where the condition
          // holds, remembering the previous mask.
          Reg = allocReg();
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                  Reg)
                  .addOperand(MI.getOperand(0)); // VCC
          // Reg ^= EXEC — Reg now holds exactly the lanes to re-enable for
          // the ELSE block (or at ENDIF).
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
                  Reg)
                  .addReg(Reg)
                  .addReg(AMDGPU::EXEC);
          MI.eraseFromParent();
          PredicateStack.push_back(Reg);
          break;

        case AMDGPU::ELSE:
          // Reg |= EXEC (saving the result); then EXEC = Reg ^ EXEC flips
          // execution to the lanes that skipped the IF block.
          Reg = PredicateStack.back();
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
                  Reg)
                  .addReg(Reg);
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
                  AMDGPU::EXEC)
                  .addReg(Reg)
                  .addReg(AMDGPU::EXEC);
          MI.eraseFromParent();
          break;

        case AMDGPU::ENDIF:
          // EXEC |= Reg — re-enable the lanes saved when the IF was opened,
          // then return Reg to the free pool.
          Reg = PredicateStack.back();
          PredicateStack.pop_back();
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
                  AMDGPU::EXEC)
                  .addReg(AMDGPU::EXEC)
                  .addReg(Reg);
          freeReg(Reg);

          // At the outermost ENDIF of a pixel shader, a fully-dead wavefront
          // (EXEC == 0) can be terminated early instead of executing the
          // rest of the shader with no active lanes.
          if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
              PredicateStack.empty()) {
            // If the exec mask is non-zero, skip the next two instructions
            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
                    .addImm(3)
                    .addReg(AMDGPU::EXEC);

            // Exec mask is zero: Export to NULL target...
            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
                    .addImm(0)
                    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
                    .addImm(0)
                    .addImm(1)
                    .addImm(1)
                    .addReg(AMDGPU::SREG_LIT_0)
                    .addReg(AMDGPU::SREG_LIT_0)
                    .addReg(AMDGPU::SREG_LIT_0)
                    .addReg(AMDGPU::SREG_LIT_0);

            // ... and terminate wavefront
            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
          }
          MI.eraseFromParent();
          break;
      }
    }
  }
  return true;
}

/// \brief Take a free SReg_64 register from the pool.
///
/// Pops from the back of UnusedRegisters, so the most recently freed
/// register is reused first.  Asserts if the pool has been exhausted.
unsigned SILowerControlFlowPass::allocReg() {

  assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
  unsigned Result = UnusedRegisters.back();
  UnusedRegisters.pop_back();
  return Result;
}

/// \brief Return \p Reg to the free pool; it becomes the next register that
/// allocReg() hands out.
void SILowerControlFlowPass::freeReg(unsigned Reg) {

  UnusedRegisters.push_back(Reg);
}