lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395

//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// wholistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations with equivalent SIMD instructions to use
// them. For stress-testing the transformation function.
static cl::opt<bool>
TransformAll("aarch64-simd-scalar-force-all",
             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
             cl::init(false), cl::Hidden);

STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const AArch64InstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr *MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr *MI);

  // processMachineBasicBlock - Main optimzation loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &F) override;

  const char *getPassName() const override {
    return "AdvSIMD Scalar Operation Optimization";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return zero_reg if the instruction is not a copy.
static unsigned getSrcFromCopy(const MachineInstr *MI,
                               const MachineRegisterInfo *MRI,
                               unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return MI->getOperand(1).getReg();
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return MI->getOperand(1).getReg();
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
      return MI->getOperand(1).getReg();
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return MI->getOperand(1).getReg();
    }
  }

  // Otherwise, this is some other kind of instruction.
  return 0;
}

// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static int getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

static bool isTransformable(const MachineInstr *MI) {
  int Opc = MI->getOpcode();
  return Opc != getTransformOpcode(Opc);
}

// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool
AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  unsigned OrigSrc0 = MI->getOperand(1).getReg();
  unsigned OrigSrc1 = MI->getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (Src0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    if (Src1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instructions is a cross class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the tranforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph based cost analysis described above.
  unsigned Dst = MI->getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies required,
  // return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}

static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB =
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
              Dst)
          .addReg(Src, getKillRegState(IsKill));
  DEBUG(dbgs() << "    adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}

// transformInstruction - Perform the transformation of an instruction
// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
  DEBUG(dbgs() << "Scalar transform: " << *MI);

  MachineBasicBlock *MBB = MI->getParent();
  int OldOpc = MI->getOpcode();
  int NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  unsigned OrigSrc0 = MI->getOperand(1).getReg();
  unsigned OrigSrc1 = MI->getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
      assert(Src0 && "Can't delete copy w/o a valid original source!");
      Def->eraseFromParent();
      ++NumCopiesDeleted;
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
      assert(Src1 && "Can't delete copy w/o a valid original source!");
      Def->eraseFromParent();
      ++NumCopiesDeleted;
    }
  }
  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, true);
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, true);
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(true), SubReg0)
      .addReg(Src1, getKillRegState(true), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI->eraseFromParent();

  ++NumScalarInsnsUsed;
}

// processMachineBasicBlock - Main optimzation loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
    MachineInstr *MI = I;
    ++I;
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}

// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  const TargetMachine &TM = mf.getTarget();
  MRI = &mf.getRegInfo();
  TII = static_cast<const AArch64InstrInfo *>(
      TM.getSubtargetImpl()->getInstrInfo());

  // Just check things on a one-block-at-a-time basis.
  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
    if (processMachineBasicBlock(I))
      Changed = true;
  return Changed;
}

// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}