aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
blob: 2cf3c2275d6c2eda232a58e3c9cd0a0b96bba2cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// For best-case performance on Cortex-A57, we should try to use a balanced
// mix of odd and even D-registers when performing a critical sequence of
// independent, non-quadword FP/ASIMD floating-point multiply or
// multiply-accumulate operations.
//
// This pass attempts to detect situations where the register allocation may
// adversely affect this load balancing and to change the registers used so as
// to better utilize the CPU.
//
// Ideally we'd just take each multiply or multiply-accumulate in turn and
// allocate it alternating even or odd registers. However, multiply-accumulates
// are most efficiently performed in the same functional unit as their
// accumulation operand. Therefore this pass tries to find maximal sequences
// ("Chains") of multiply-accumulates linked via their accumulation operand,
// and assign them all the same "color" (oddness/evenness).
//
// This optimization affects S-register and D-register floating point
// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
// not affected.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <list>
using namespace llvm;

#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"

// Enforce the algorithm to use the scavenged register even when the original
// destination register is the correct color. Used for testing.
static cl::opt<bool>
TransformAll("aarch64-a57-fp-load-balancing-force-all",
             cl::desc("Always modify dest registers regardless of color"),
             cl::init(false), cl::Hidden);

// Never use the balance information obtained from chains - return a specific
// color always. Used for testing.
static cl::opt<unsigned>
OverrideBalance("aarch64-a57-fp-load-balancing-override",
              cl::desc("Ignore balance information, always return "
                       "(1: Even, 2: Odd)."),
              cl::init(0), cl::Hidden);

//===----------------------------------------------------------------------===//
// Helper functions

// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
static bool isMul(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AArch64::FMULSrr:
  case AArch64::FNMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FNMULDrr:
    return true;
  default:
    return false;
  }
}

// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
static bool isMla(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AArch64::FMSUBSrrr:
  case AArch64::FMADDSrrr:
  case AArch64::FNMSUBSrrr:
  case AArch64::FNMADDSrrr:
  case AArch64::FMSUBDrrr:
  case AArch64::FMADDDrrr:
  case AArch64::FNMSUBDrrr:
  case AArch64::FNMADDDrrr:
    return true;
  default:
    return false;
  }
}

namespace llvm {
static void initializeAArch64A57FPLoadBalancingPass(PassRegistry &);
}

//===----------------------------------------------------------------------===//

namespace {
/// A "color", which is either even or odd. Yes, these aren't really colors
/// but the algorithm is conceptually doing two-color graph coloring.
enum class Color { Even, Odd };
#ifndef NDEBUG
static const char *ColorNames[2] = { "Even", "Odd" };
#endif

class Chain;

class AArch64A57FPLoadBalancing : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetRegisterInfo *TRI;
  RegisterClassInfo RCI;

public:
  static char ID;
  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
    initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  const char *getPassName() const override {
    return "A57 FP Anti-dependency breaker";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  bool runOnBasicBlock(MachineBasicBlock &MBB);
  bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
                     int &Balance);
  bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
  int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
  void scanInstruction(MachineInstr *MI, unsigned Idx,
                       std::map<unsigned, Chain*> &Active,
                       std::set<std::unique_ptr<Chain>> &AllChains);
  void maybeKillChain(MachineOperand &MO, unsigned Idx,
                      std::map<unsigned, Chain*> &RegChains);
  Color getColor(unsigned Register);
  Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
};
}

char AArch64A57FPLoadBalancing::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
                      "AArch64 A57 FP Load-Balancing", false, false)
INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
                    "AArch64 A57 FP Load-Balancing", false, false)

namespace {
/// A Chain is a sequence of instructions that are linked together by 
/// an accumulation operand. For example:
///
///   fmul d0<def>, ?
///   fmla d1<def>, ?, ?, d0<kill>
///   fmla d2<def>, ?, ?, d1<kill>
///
/// There may be other instructions interleaved in the sequence that
/// do not belong to the chain. These other instructions must not use
/// the "chain" register at any point.
///
/// We currently only support chains where the "chain" operand is killed
/// at each link in the chain for simplicity.
/// A chain has three important instructions - Start, Last and Kill.
///   * The start instruction is the first instruction in the chain.
///   * Last is the final instruction in the chain.
///   * Kill may or may not be defined. If defined, Kill is the instruction
///     where the outgoing value of the Last instruction is killed.
///     This information is important as if we know the outgoing value is
///     killed with no intervening uses, we can safely change its register.
///
/// Without a kill instruction, we must assume the outgoing value escapes
/// beyond our model and either must not change its register or must
/// create a fixup FMOV to keep the old register value consistent.
///
class Chain {
public:
  /// The important (marker) instructions.
  MachineInstr *StartInst, *LastInst, *KillInst;
  /// The index, from the start of the basic block, that each marker
  /// appears. These are stored so we can do quick interval tests.
  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
  /// All instructions in the chain.
  std::set<MachineInstr*> Insts;
  /// True if KillInst cannot be modified. If this is true,
  /// we cannot change LastInst's outgoing register.
  /// This will be true for tied values and regmasks.
  bool KillIsImmutable;
  /// The "color" of LastInst. This will be the preferred chain color,
  /// as changing intermediate nodes is easy but changing the last
  /// instruction can be more tricky.
  Color LastColor;

  Chain(MachineInstr *MI, unsigned Idx, Color C)
      : StartInst(MI), LastInst(MI), KillInst(nullptr),
        StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
        LastColor(C) {
    Insts.insert(MI);
  }

  /// Add a new instruction into the chain. The instruction's dest operand
  /// has the given color.
  void add(MachineInstr *MI, unsigned Idx, Color C) {
    LastInst = MI;
    LastInstIdx = Idx;
    LastColor = C;
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
           "Chain: broken invariant. A Chain can only be killed after its last "
           "def");

    Insts.insert(MI);
  }

  /// Return true if MI is a member of the chain.
  bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }

  /// Return the number of instructions in the chain.
  unsigned size() const {
    return Insts.size();
  }

  /// Inform the chain that its last active register (the dest register of
  /// LastInst) is killed by MI with no intervening uses or defs.
  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
    KillInst = MI;
    KillInstIdx = Idx;
    KillIsImmutable = Immutable;
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
           "Chain: broken invariant. A Chain can only be killed after its last "
           "def");
  }

  /// Return the first instruction in the chain.
  MachineInstr *getStart() const { return StartInst; }
  /// Return the last instruction in the chain.
  MachineInstr *getLast() const { return LastInst; }
  /// Return the "kill" instruction (as set with setKill()) or NULL.
  MachineInstr *getKill() const { return KillInst; }
  /// Return an instruction that can be used as an iterator for the end
  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
  MachineBasicBlock::iterator getEnd() const {
    return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
  }

  /// Can the Kill instruction (assuming one exists) be modified?
  bool isKillImmutable() const { return KillIsImmutable; }

  /// Return the preferred color of this chain.
  Color getPreferredColor() {
    if (OverrideBalance != 0)
      return OverrideBalance == 1 ? Color::Even : Color::Odd;
    return LastColor;
  }

  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
  bool rangeOverlapsWith(const Chain &Other) const {
    unsigned End = KillInst ? KillInstIdx : LastInstIdx;
    unsigned OtherEnd = Other.KillInst ?
      Other.KillInstIdx : Other.LastInstIdx;

    return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
  }

  /// Return true if this chain starts before Other.
  bool startsBefore(const Chain *Other) const {
    return StartInstIdx < Other->StartInstIdx;
  }

  /// Return true if the group will require a fixup MOV at the end.
  bool requiresFixup() const {
    return (getKill() && isKillImmutable()) || !getKill();
  }

  /// Return a simple string representation of the chain.
  std::string str() const {
    std::string S;
    raw_string_ostream OS(S);
    
    OS << "{";
    StartInst->print(OS, NULL, true);
    OS << " -> ";
    LastInst->print(OS, NULL, true);
    if (KillInst) {
      OS << " (kill @ ";
      KillInst->print(OS, NULL, true);
      OS << ")";
    }
    OS << "}";

    return OS.str();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//

bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
  bool Changed = false;
  DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");

  MRI = &F.getRegInfo();
  TRI = F.getRegInfo().getTargetRegisterInfo();
  RCI.runOnMachineFunction(F);

  for (auto &MBB : F) {
    Changed |= runOnBasicBlock(MBB);
  }

  return Changed;
}

bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");

  // First, scan the basic block producing a set of chains.

  // The currently "active" chains - chains that can be added to and haven't
  // been killed yet. This is keyed by register - all chains can only have one
  // "link" register between each inst in the chain.
  std::map<unsigned, Chain*> ActiveChains;
  std::set<std::unique_ptr<Chain>> AllChains;
  unsigned Idx = 0;
  for (auto &MI : MBB)
    scanInstruction(&MI, Idx++, ActiveChains, AllChains);

  DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");

  // Group the chains into disjoint sets based on their liveness range. This is
  // a poor-man's version of graph coloring. Ideally we'd create an interference
  // graph and perform full-on graph coloring on that, but;
  //   (a) That's rather heavyweight for only two colors.
  //   (b) We expect multiple disjoint interference regions - in practice the live
  //       range of chains is quite small and they are clustered between loads
  //       and stores.
  EquivalenceClasses<Chain*> EC;
  for (auto &I : AllChains)
    EC.insert(I.get());

  for (auto &I : AllChains)
    for (auto &J : AllChains)
      if (I != J && I->rangeOverlapsWith(*J))
        EC.unionSets(I.get(), J.get());
  DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");

  // Now we assume that every member of an equivalence class interferes
  // with every other member of that class, and with no members of other classes.

  // Convert the EquivalenceClasses to a simpler set of sets.
  std::vector<std::vector<Chain*> > V;
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
    if (Cs.empty()) continue;
    V.push_back(std::move(Cs));
  }

  // Now we have a set of sets, order them by start address so
  // we can iterate over them sequentially.
  std::sort(V.begin(), V.end(),
            [](const std::vector<Chain*> &A,
               const std::vector<Chain*> &B) {
      return A.front()->startsBefore(B.front());
    });

  // As we only have two colors, we can track the global (BB-level) balance of
  // odds versus evens. We aim to keep this near zero to keep both execution
  // units fed.
  // Positive means we're even-heavy, negative we're odd-heavy.
  //
  // FIXME: If chains have interdependencies, for example:
  //   mul r0, r1, r2
  //   mul r3, r0, r1
  // We do not model this and may color each one differently, assuming we'll
  // get ILP when we obviously can't. This hasn't been seen to be a problem
  // in practice so far, so we simplify the algorithm by ignoring it.
  int Parity = 0;

  for (auto &I : V)
    Changed |= colorChainSet(std::move(I), MBB, Parity);

  return Changed;
}

Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
                                                  std::vector<Chain*> &L) {
  if (L.empty())
    return nullptr;

  // We try and get the best candidate from L to color next, given that our
  // preferred color is "PreferredColor". L is ordered from larger to smaller
  // chains. It is beneficial to color the large chains before the small chains,
  // but if we can't find a chain of the maximum length with the preferred color,
  // we fuzz the size and look for slightly smaller chains before giving up and
  // returning a chain that must be recolored.

  // FIXME: Does this need to be configurable?
  const unsigned SizeFuzz = 1;
  unsigned MinSize = L.front()->size() - SizeFuzz;
  for (auto I = L.begin(), E = L.end(); I != E; ++I) {
    if ((*I)->size() <= MinSize) {
      // We've gone past the size limit. Return the previous item.
      Chain *Ch = *--I;
      L.erase(I);
      return Ch;
    }

    if ((*I)->getPreferredColor() == PreferredColor) {
      Chain *Ch = *I;
      L.erase(I);
      return Ch;
    }
  }
  
  // Bailout case - just return the first item.
  Chain *Ch = L.front();
  L.erase(L.begin());
  return Ch;
}

bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
                                              MachineBasicBlock &MBB,
                                              int &Parity) {
  bool Changed = false;
  DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");

  // Sort by descending size order so that we allocate the most important
  // sets first.
  // Tie-break equivalent sizes by sorting chains requiring fixups before
  // those without fixups. The logic here is that we should look at the
  // chains that we cannot change before we look at those we can,
  // so the parity counter is updated and we know what color we should
  // change them to!
  // Final tie-break with instruction order so pass output is stable (i.e. not
  // dependent on malloc'd pointer values).
  std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
      if (G1->size() != G2->size())
        return G1->size() > G2->size();
      if (G1->requiresFixup() != G2->requiresFixup())
        return G1->requiresFixup() > G2->requiresFixup();
      // Make sure startsBefore() produces a stable final order.
      assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
             "Starts before not total order!");
      return G1->startsBefore(G2);
    });

  Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
    // Start off by assuming we'll color to our own preferred color.
    Color C = PreferredColor;
    if (Parity == 0)
      // But if we really don't care, use the chain's preferred color.
      C = G->getPreferredColor();

    DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
          << ColorNames[(int)C] << "\n");

    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
    // happens infrequently and when it does it has at least a 50% chance of
    // slowing code down instead of speeding it up.
    if (G->requiresFixup() && C != G->getPreferredColor()) {
      C = G->getPreferredColor();
      DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
            "color remains " << ColorNames[(int)C] << "\n");
    }

    Changed |= colorChain(G, C, MBB);

    Parity += (C == Color::Even) ? G->size() : -G->size();
    PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
  }

  return Changed;
}

int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
                                                MachineBasicBlock &MBB) {
  RegScavenger RS;
  RS.enterBasicBlock(&MBB);
  RS.forward(MachineBasicBlock::iterator(G->getStart()));

  // Can we find an appropriate register that is available throughout the life 
  // of the chain?
  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
  BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
  for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
       I != E; ++I) {
    RS.forward(I);
    AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));

    // Remove any registers clobbered by a regmask or any def register that is
    // immediately dead.
    for (auto J : I->operands()) {
      if (J.isRegMask())
        AvailableRegs.clearBitsNotInMask(J.getRegMask());

      if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) {
        assert(J.isDead() && "Non-dead def should have been removed by now!");
        AvailableRegs.reset(J.getReg());
      }
    }
  }

  // Make sure we allocate in-order, to get the cheapest registers first.
  auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
  for (auto Reg : Ord) {
    if (!AvailableRegs[Reg])
      continue;
    if ((C == Color::Even && (Reg % 2) == 0) ||
        (C == Color::Odd && (Reg % 2) == 1))
      return Reg;
  }

  return -1;
}

bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
                                           MachineBasicBlock &MBB) {
  bool Changed = false;
  DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
        << ColorNames[(int)C] << ")\n");

  // Try and obtain a free register of the right class. Without a register
  // to play with we cannot continue.
  int Reg = scavengeRegister(G, C, MBB);
  if (Reg == -1) {
    DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
    return false;
  }
  DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");

  std::map<unsigned, unsigned> Substs;
  for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
       I != E; ++I) {
    if (!G->contains(I) &&
        (&*I != G->getKill() || G->isKillImmutable()))
      continue;

    // I is a member of G, or I is a mutable instruction that kills G.

    std::vector<unsigned> ToErase;
    for (auto &U : I->operands()) {
      if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
        unsigned OrigReg = U.getReg();
        U.setReg(Substs[OrigReg]);
        if (U.isKill())
          // Don't erase straight away, because there may be other operands
          // that also reference this substitution!
          ToErase.push_back(OrigReg);
      } else if (U.isRegMask()) {
        for (auto J : Substs) {
          if (U.clobbersPhysReg(J.first))
            ToErase.push_back(J.first);
        }
      }
    }
    // Now it's safe to remove the substs identified earlier.
    for (auto J : ToErase)
      Substs.erase(J);

    // Only change the def if this isn't the last instruction.
    if (&*I != G->getKill()) {
      MachineOperand &MO = I->getOperand(0);

      bool Change = TransformAll || getColor(MO.getReg()) != C;
      if (G->requiresFixup() && &*I == G->getLast())
        Change = false;

      if (Change) {
        Substs[MO.getReg()] = Reg;
        MO.setReg(Reg);
        MRI->setPhysRegUsed(Reg);

        Changed = true;
      }
    }
  }
  assert(Substs.size() == 0 && "No substitutions should be left active!");

  if (G->getKill()) {
    DEBUG(dbgs() << " - Kill instruction seen.\n");
  } else {
    // We didn't have a kill instruction, but we didn't seem to need to change
    // the destination register anyway.
    DEBUG(dbgs() << " - Destination register not changed.\n");
  }
  return Changed;
}

void AArch64A57FPLoadBalancing::
scanInstruction(MachineInstr *MI, unsigned Idx, 
                std::map<unsigned, Chain*> &ActiveChains,
                std::set<std::unique_ptr<Chain>> &AllChains) {
  // Inspect "MI", updating ActiveChains and AllChains.

  if (isMul(MI)) {

    for (auto &I : MI->uses())
      maybeKillChain(I, Idx, ActiveChains);
    for (auto &I : MI->defs())
      maybeKillChain(I, Idx, ActiveChains);

    // Create a new chain. Multiplies don't require forwarding so can go on any
    // unit.
    unsigned DestReg = MI->getOperand(0).getReg();

    DEBUG(dbgs() << "New chain started for register "
          << TRI->getName(DestReg) << " at " << *MI);

    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
    ActiveChains[DestReg] = G.get();
    AllChains.insert(std::move(G));

  } else if (isMla(MI)) {

    // It is beneficial to keep MLAs on the same functional unit as their
    // accumulator operand.
    unsigned DestReg  = MI->getOperand(0).getReg();
    unsigned AccumReg = MI->getOperand(3).getReg();

    maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
    maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
    if (DestReg != AccumReg)
      maybeKillChain(MI->getOperand(0), Idx, ActiveChains);

    if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
      DEBUG(dbgs() << "Chain found for accumulator register "
            << TRI->getName(AccumReg) << " in MI " << *MI);

      // For simplicity we only chain together sequences of MULs/MLAs where the
      // accumulator register is killed on each instruction. This means we don't
      // need to track other uses of the registers we want to rewrite.
      //
      // FIXME: We could extend to handle the non-kill cases for more coverage.
      if (MI->getOperand(3).isKill()) {
        // Add to chain.
        DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
        ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
        // Handle cases where the destination is not the same as the accumulator.
        if (DestReg != AccumReg) {
          ActiveChains[DestReg] = ActiveChains[AccumReg];
          ActiveChains.erase(AccumReg);
        }
        return;
      }

      DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
            << "marked <kill>!\n");
      maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
    }

    DEBUG(dbgs() << "Creating new chain for dest register "
          << TRI->getName(DestReg) << "\n");
    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
    ActiveChains[DestReg] = G.get();
    AllChains.insert(std::move(G));

  } else {

    // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
    // lists.
    for (auto &I : MI->uses())
      maybeKillChain(I, Idx, ActiveChains);
    for (auto &I : MI->defs())
      maybeKillChain(I, Idx, ActiveChains);

  }
}

void AArch64A57FPLoadBalancing::
maybeKillChain(MachineOperand &MO, unsigned Idx,
               std::map<unsigned, Chain*> &ActiveChains) {
  // Given an operand and the set of active chains (keyed by register),
  // determine if a chain should be ended and remove from ActiveChains.
  MachineInstr *MI = MO.getParent();

  if (MO.isReg()) {

    // If this is a KILL of a current chain, record it.
    if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
      DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
            << "\n");
      ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
    }
    ActiveChains.erase(MO.getReg());

  } else if (MO.isRegMask()) {

    for (auto I = ActiveChains.begin(), E = ActiveChains.end();
         I != E;) {
      if (MO.clobbersPhysReg(I->first)) {
        DEBUG(dbgs() << "Kill (regmask) seen for chain "
              << TRI->getName(I->first) << "\n");
        I->second->setKill(MI, Idx, /*Immutable=*/true);
        ActiveChains.erase(I++);
      } else
        ++I;
    }

  }
}

Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
  if ((TRI->getEncodingValue(Reg) % 2) == 0)
    return Color::Even;
  else
    return Color::Odd;
}

// Factory function used by AArch64TargetMachine to add the pass to the passmanager.
FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
  return new AArch64A57FPLoadBalancing();
}