aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/ARM/A15SDOptimizer.cpp
blob: 7a1865ce5fd6bbdd66e1a73e7886ed0659eb0b12 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The Cortex-A15 processor employs a tracking scheme in its register renaming
// in order to process each instruction's micro-ops speculatively and
// out-of-order with appropriate forwarding. The ARM architecture allows VFP
// instructions to read and write 32-bit S-registers.  Each S-register
// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
//
// There are several instruction patterns which can be used to provide this
// capability which can provide higher performance than other, potentially more
// direct patterns, specifically around when one micro-op reads a D-register
// operand that has recently been written as one or more S-register results.
//
// This file defines a pre-regalloc pass which looks for SPR producers which
// are going to be used by a DPR (or QPR) consumers and creates the more
// optimized access pattern.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <map>
#include <set>

using namespace llvm;

#define DEBUG_TYPE "a15-sd-optimizer"

namespace {
  struct A15SDOptimizer : public MachineFunctionPass {
    static char ID;
    A15SDOptimizer() : MachineFunctionPass(ID) {}

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM A15 S->D optimizer";
    }

  private:
    const ARMBaseInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    MachineRegisterInfo *MRI;

    bool runOnInstruction(MachineInstr *MI);

    //
    // Instruction builder helpers
    //
    unsigned createDupLane(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertBefore,
                           DebugLoc DL,
                           unsigned Reg, unsigned Lane,
                           bool QPR=false);

    unsigned createExtractSubreg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertBefore,
                                 DebugLoc DL,
                                 unsigned DReg, unsigned Lane,
                                 const TargetRegisterClass *TRC);

    unsigned createVExt(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator InsertBefore,
                        DebugLoc DL,
                        unsigned Ssub0, unsigned Ssub1);

    unsigned createRegSequence(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator InsertBefore,
                               DebugLoc DL,
                               unsigned Reg1, unsigned Reg2);

    unsigned createInsertSubreg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator InsertBefore,
                                DebugLoc DL, unsigned DReg, unsigned Lane,
                                unsigned ToInsert);

    unsigned createImplicitDef(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator InsertBefore,
                               DebugLoc DL);

    //
    // Various property checkers
    //
    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
    bool hasPartialWrite(MachineInstr *MI);
    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
    unsigned getDPRLaneFromSPR(unsigned SReg);

    //
    // Methods used for getting the definitions of partial registers
    //

    MachineInstr *elideCopies(MachineInstr *MI);
    void elideCopiesAndPHIs(MachineInstr *MI,
                            SmallVectorImpl<MachineInstr*> &Outs);

    //
    // Pattern optimization methods
    //
    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
    unsigned optimizeSDPattern(MachineInstr *MI);
    unsigned getPrefSPRLane(unsigned SReg);

    //
    // Sanitizing method - used to make sure if don't leave dead code around.
    //
    void eraseInstrWithNoUses(MachineInstr *MI);

    //
    // A map used to track the changes done by this pass.
    //
    std::map<MachineInstr*, unsigned> Replacements;
    std::set<MachineInstr *> DeadInstr;
  };
  char A15SDOptimizer::ID = 0;
} // end anonymous namespace

// Returns true if this is a use of a SPR register.
bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
                                  const TargetRegisterClass *TRC) {
  if (!MO.isReg())
    return false;
  unsigned Reg = MO.getReg();

  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
  else
    return TRC->contains(Reg);
}

unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
                                           &ARM::DPRRegClass);
  if (DReg != ARM::NoRegister) return ARM::ssub_1;
  return ARM::ssub_0;
}

// Get the subreg type that is most likely to be coalesced
// for an SPR register that will be used in VDUP32d pseudo.
unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
  if (!TRI->isVirtualRegister(SReg))
    return getDPRLaneFromSPR(SReg);

  MachineInstr *MI = MRI->getVRegDef(SReg);
  if (!MI) return ARM::ssub_0;
  MachineOperand *MO = MI->findRegisterDefOperand(SReg);

  assert(MO->isReg() && "Non-register operand found!");
  if (!MO) return ARM::ssub_0;

  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
                                    &ARM::SPRRegClass)) {
    SReg = MI->getOperand(1).getReg();
  }

  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
    return ARM::ssub_0;
  }
  return getDPRLaneFromSPR(SReg);
}

// MI is known to be dead. Figure out what instructions
// are also made dead by this and mark them for removal.
void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
  SmallVector<MachineInstr *, 8> Front;
  DeadInstr.insert(MI);

  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
  Front.push_back(MI);

  while (Front.size() != 0) {
    MI = Front.back();
    Front.pop_back();

    // MI is already known to be dead. We need to see
    // if other instructions can also be removed.
    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
      MachineOperand &MO = MI->getOperand(i);
      if ((!MO.isReg()) || (!MO.isUse()))
        continue;
      unsigned Reg = MO.getReg();
      if (!TRI->isVirtualRegister(Reg))
        continue;
      MachineOperand *Op = MI->findRegisterDefOperand(Reg);

      if (!Op)
        continue;

      MachineInstr *Def = Op->getParent();

      // We don't need to do anything if we have already marked
      // this instruction as being dead.
      if (DeadInstr.find(Def) != DeadInstr.end())
        continue;

      // Check if all the uses of this instruction are marked as
      // dead. If so, we can also mark this instruction as being
      // dead.
      bool IsDead = true;
      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
        MachineOperand &MODef = Def->getOperand(j);
        if ((!MODef.isReg()) || (!MODef.isDef()))
          continue;
        unsigned DefReg = MODef.getReg();
        if (!TRI->isVirtualRegister(DefReg)) {
          IsDead = false;
          break;
        }
        for (MachineRegisterInfo::use_instr_iterator
             II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end();
             II != EE; ++II) {
          // We don't care about self references.
          if (&*II == Def)
            continue;
          if (DeadInstr.find(&*II) == DeadInstr.end()) {
            IsDead = false;
            break;
          }
        }
      }

      if (!IsDead) continue;

      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
      DeadInstr.insert(Def);
    }
  }
}

// Creates the more optimized patterns and generally does all the code
// transformations in this pass.
unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
  if (MI->isCopy()) {
    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
  }

  if (MI->isInsertSubreg()) {
    unsigned DPRReg = MI->getOperand(1).getReg();
    unsigned SPRReg = MI->getOperand(2).getReg();

    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());

      if (DPRMI && SPRMI) {
        // See if the first operand of this insert_subreg is IMPLICIT_DEF
        MachineInstr *ECDef = elideCopies(DPRMI);
        if (ECDef && ECDef->isImplicitDef()) {
          // Another corner case - if we're inserting something that is purely
          // a subreg copy of a DPR, just use that DPR.

          MachineInstr *EC = elideCopies(SPRMI);
          // Is it a subreg copy of ssub_0?
          if (EC && EC->isCopy() &&
              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);

            // Find the thing we're subreg copying out of - is it of the same
            // regclass as DPRMI? (i.e. a DPR or QPR).
            unsigned FullReg = SPRMI->getOperand(1).getReg();
            const TargetRegisterClass *TRC =
              MRI->getRegClass(MI->getOperand(1).getReg());
            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
              eraseInstrWithNoUses(MI);
              return FullReg;
            }
          }

          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
        }
      }
    }
    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
  }

  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
                                          &ARM::SPRRegClass)) {
    // See if all bar one of the operands are IMPLICIT_DEF and insert the
    // optimizer pattern accordingly.
    unsigned NumImplicit = 0, NumTotal = 0;
    unsigned NonImplicitReg = ~0U;

    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
      if (!MI->getOperand(I).isReg())
        continue;
      ++NumTotal;
      unsigned OpReg = MI->getOperand(I).getReg();

      if (!TRI->isVirtualRegister(OpReg))
        break;

      MachineInstr *Def = MRI->getVRegDef(OpReg);
      if (!Def)
        break;
      if (Def->isImplicitDef())
        ++NumImplicit;
      else
        NonImplicitReg = MI->getOperand(I).getReg();
    }

    if (NumImplicit == NumTotal - 1)
      return optimizeAllLanesPattern(MI, NonImplicitReg);
    else
      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
  }

  llvm_unreachable("Unhandled update pattern!");
}

// Return true if this MachineInstr inserts a scalar (SPR) value into
// a D or Q register.
bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
  // The only way we can do a partial register update is through a COPY,
  // INSERT_SUBREG or REG_SEQUENCE.
  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
    return true;

  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
                                           &ARM::SPRRegClass))
    return true;

  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
    return true;

  return false;
}

// Looks through full copies to get the instruction that defines the input
// operand for MI.
MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
  if (!MI->isFullCopy())
    return MI;
  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
    return nullptr;
  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
  if (!Def)
    return nullptr;
  return elideCopies(Def);
}

// Look through full copies and PHIs to get the set of non-copy MachineInstrs
// that can produce MI.
void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
                                        SmallVectorImpl<MachineInstr*> &Outs) {
   // Looking through PHIs may create loops so we need to track what
   // instructions we have visited before.
   std::set<MachineInstr *> Reached;
   SmallVector<MachineInstr *, 8> Front;
   Front.push_back(MI);
   while (Front.size() != 0) {
     MI = Front.back();
     Front.pop_back();

     // If we have already explored this MachineInstr, ignore it.
     if (Reached.find(MI) != Reached.end())
       continue;
     Reached.insert(MI);
     if (MI->isPHI()) {
       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
         unsigned Reg = MI->getOperand(I).getReg();
         if (!TRI->isVirtualRegister(Reg)) {
           continue;
         }
         MachineInstr *NewMI = MRI->getVRegDef(Reg);
         if (!NewMI)
           continue;
         Front.push_back(NewMI);
       }
     } else if (MI->isFullCopy()) {
       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
         continue;
       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
       if (!NewMI)
         continue;
       Front.push_back(NewMI);
     } else {
       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
       Outs.push_back(MI);
     }
   }
}

// Return the DPR virtual registers that are read by this machine instruction
// (if any).
SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
      MI->isKill())
    return SmallVector<unsigned, 8>();

  SmallVector<unsigned, 8> Defs;
  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
    MachineOperand &MO = MI->getOperand(i);

    if (!MO.isReg() || !MO.isUse())
      continue;
    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
        !usesRegClass(MO, &ARM::QPRRegClass) &&
        !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
      continue;

    Defs.push_back(MO.getReg());
  }
  return Defs;
}

// Creates a DPR register from an SPR one by using a VDUP.
unsigned
A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator InsertBefore,
                              DebugLoc DL,
                              unsigned Reg, unsigned Lane, bool QPR) {
  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
                                                  &ARM::DPRRegClass);
  AddDefaultPred(BuildMI(MBB,
                         InsertBefore,
                         DL,
                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
                         Out)
                   .addReg(Reg)
                   .addImm(Lane));

  return Out;
}

// Creates a SPR register from a DPR by copying the value in lane 0.
unsigned
A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertBefore,
                                    DebugLoc DL,
                                    unsigned DReg, unsigned Lane,
                                    const TargetRegisterClass *TRC) {
  unsigned Out = MRI->createVirtualRegister(TRC);
  BuildMI(MBB,
          InsertBefore,
          DL,
          TII->get(TargetOpcode::COPY), Out)
    .addReg(DReg, 0, Lane);

  return Out;
}

// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
unsigned
A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertBefore,
                                  DebugLoc DL,
                                  unsigned Reg1, unsigned Reg2) {
  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
  BuildMI(MBB,
          InsertBefore,
          DL,
          TII->get(TargetOpcode::REG_SEQUENCE), Out)
    .addReg(Reg1)
    .addImm(ARM::dsub_0)
    .addReg(Reg2)
    .addImm(ARM::dsub_1);
  return Out;
}

// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
// and merges them into one DPR register.
unsigned
A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertBefore,
                           DebugLoc DL,
                           unsigned Ssub0, unsigned Ssub1) {
  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
  AddDefaultPred(BuildMI(MBB,
                         InsertBefore,
                         DL,
                         TII->get(ARM::VEXTd32), Out)
                   .addReg(Ssub0)
                   .addReg(Ssub1)
                   .addImm(1));
  return Out;
}

unsigned
A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator InsertBefore,
                                   DebugLoc DL, unsigned DReg, unsigned Lane,
                                   unsigned ToInsert) {
  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
  BuildMI(MBB,
          InsertBefore,
          DL,
          TII->get(TargetOpcode::INSERT_SUBREG), Out)
    .addReg(DReg)
    .addReg(ToInsert)
    .addImm(Lane);

  return Out;
}

unsigned
A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertBefore,
                                  DebugLoc DL) {
  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
  BuildMI(MBB,
          InsertBefore,
          DL,
          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
  return Out;
}

// This function inserts instructions in order to optimize interactions between
// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
// lanes, and the using VEXT instructions to recompose the result.
unsigned
A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
  MachineBasicBlock::iterator InsertPt(MI);
  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock &MBB = *MI->getParent();
  InsertPt++;
  unsigned Out;

  // DPair has the same length as QPR and also has two DPRs as subreg.
  // Treat DPair as QPR.
  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
      MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
                                         ARM::dsub_0, &ARM::DPRRegClass);
    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
                                         ARM::dsub_1, &ARM::DPRRegClass);

    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);

    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);

    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);

  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);

  } else {
    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
           "Found unexpected regclass!");

    unsigned PrefLane = getPrefSPRLane(Reg);
    unsigned Lane;
    switch (PrefLane) {
      case ARM::ssub_0: Lane = 0; break;
      case ARM::ssub_1: Lane = 1; break;
      default: llvm_unreachable("Unknown preferred lane!");
    }

    // Treat DPair as QPR
    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
                   usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);

    Out = createImplicitDef(MBB, InsertPt, DL);
    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
    eraseInstrWithNoUses(MI);
  }
  return Out;
}

bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
  // We look for instructions that write S registers that are then read as
  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
  // merge two SPR values to form a DPR register.  In order avoid false
  // positives we make sure that there is an SPR producer so we look past
  // COPY and PHI nodes to find it.
  //
  // The best code pattern for when an SPR producer is going to be used by a
  // DPR or QPR consumer depends on whether the other lanes of the
  // corresponding DPR/QPR are currently defined.
  //
  // We can handle these efficiently, depending on the type of
  // pseudo-instruction that is producing the pattern
  //
  //   * COPY:          * VDUP all lanes and merge the results together
  //                      using VEXTs.
  //
  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
  //                      lane, and the other lane(s) of the DPR/QPR register
  //                      that we are inserting in are undefined, use the
  //                      original DPR/QPR value.
  //                    * Otherwise, fall back on the same stategy as COPY.
  //
  //   * REG_SEQUENCE:  * If all except one of the input operands are
  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
  //                      defined input operand
  //                    * Otherwise, fall back on the same stategy as COPY.
  //

  // First, get all the reads of D-registers done by this instruction.
  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
  bool Modified = false;

  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
     I != E; ++I) {
    // Follow the def-use chain for this DPR through COPYs, and also through
    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
    // we can end up with multiple defs of this DPR.

    SmallVector<MachineInstr *, 8> DefSrcs;
    if (!TRI->isVirtualRegister(*I))
      continue;
    MachineInstr *Def = MRI->getVRegDef(*I);
    if (!Def)
      continue;

    elideCopiesAndPHIs(Def, DefSrcs);

    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
      EE = DefSrcs.end(); II != EE; ++II) {
      MachineInstr *MI = *II;

      // If we've already analyzed and replaced this operand, don't do
      // anything.
      if (Replacements.find(MI) != Replacements.end())
        continue;

      // Now, work out if the instruction causes a SPR->DPR dependency.
      if (!hasPartialWrite(MI))
        continue;

      // Collect all the uses of this MI's DPR def for updating later.
      SmallVector<MachineOperand*, 8> Uses;
      unsigned DPRDefReg = MI->getOperand(0).getReg();
      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
             E = MRI->use_end(); I != E; ++I)
        Uses.push_back(&*I);

      // We can optimize this.
      unsigned NewReg = optimizeSDPattern(MI);

      if (NewReg != 0) {
        Modified = true;
        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
               E = Uses.end(); I != E; ++I) {
          // Make sure to constrain the register class of the new register to
          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
          // reference into a plain DPR, and that will end poorly. NewReg is
          // always virtual here, so there will always be a matching subclass
          // to find.
          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));

          DEBUG(dbgs() << "Replacing operand "
                       << **I << " with "
                       << PrintReg(NewReg) << "\n");
          (*I)->substVirtReg(NewReg, 0, *TRI);
        }
      }
      Replacements[MI] = NewReg;
    }
  }
  return Modified;
}

bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
  // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
  // enabled when NEON is available.
  if (!(STI.isCortexA15() && STI.hasNEON()))
    return false;
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  MRI = &Fn.getRegInfo();
  bool Modified = false;

  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");

  DeadInstr.clear();
  Replacements.clear();

  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {

    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
      MI != ME;) {
      Modified |= runOnInstruction(MI++);
    }

  }

  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
                                            E = DeadInstr.end();
                                            I != E; ++I) {
    (*I)->eraseFromParent();
  }

  return Modified;
}

FunctionPass *llvm::createA15SDOptimizerPass() {
  return new A15SDOptimizer();
}