From 3881cb7a5d54c0011b40997adcd742e1c7b91abd Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Wed, 29 Sep 2010 22:42:35 +0000
Subject: Model Cortex-a9 load to SUB, RSB, ADD, ADC, SBC, RSC, CMN, MVN, or CMP pipeline forwarding path.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115098 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/ScheduleDAGInstrs.cpp               | 37 ++++++-------
 lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 34 ++++++------
 lib/Target/ARM/ARMInstrThumb2.td                |  4 +-
 lib/Target/ARM/ARMSchedule.td                   |  1 +
 lib/Target/ARM/ARMScheduleA8.td                 |  1 +
 lib/Target/ARM/ARMScheduleA9.td                 | 69 +++++++++++++++++--------
 6 files changed, 86 insertions(+), 60 deletions(-)

(limited to 'lib')

diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index da0b056..3d2565d 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -527,26 +527,23 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
   MachineInstr *DefMI = Def->getInstr();
   int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
   if (DefIdx != -1) {
-    int DefCycle = InstrItins->getOperandCycle(DefMI->getDesc().getSchedClass(),
-                                               DefIdx);
-    if (DefCycle >= 0) {
-      MachineInstr *UseMI = Use->getInstr();
-      const unsigned UseClass = UseMI->getDesc().getSchedClass();
-
-      // For all uses of the register, calculate the maxmimum latency
-      int Latency = -1;
-      for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &MO = UseMI->getOperand(i);
-        if (!MO.isReg() || !MO.isUse())
-          continue;
-        unsigned MOReg = MO.getReg();
-        if (MOReg != Reg)
-          continue;
-
-        int UseCycle = InstrItins->getOperandCycle(UseClass, i);
-        if (UseCycle >= 0)
-          Latency = std::max(Latency, DefCycle - UseCycle + 1);
-      }
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    MachineInstr *UseMI = Use->getInstr();
+    unsigned UseClass = UseMI->getDesc().getSchedClass();
+
+    // For all uses of the register, calculate the maxmimum latency
+    int Latency = -1;
+    for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = UseMI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (MOReg != Reg)
+        continue;
+
+      int UseCycle = InstrItins->getOperandLatency(DefClass, DefIdx,
+                                                   UseClass, i);
+      Latency = std::max(Latency, UseCycle);
 
       // If we found a latency, then replace the existing dependence latency.
       if (Latency >= 0)
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index fbf621d..23ff9c5 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -457,24 +457,24 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
     return;
 
   unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
-  if (Def->isMachineOpcode()) {
-    const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
-    if (DefIdx >= II.getNumDefs())
-      return;
-    int DefCycle = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx);
-    if (DefCycle < 0)
-      return;
-    int UseCycle = 1;
-    if (Use->isMachineOpcode()) {
-      const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
-      UseCycle = InstrItins->getOperandCycle(UseClass, OpIdx);
-    }
-    if (UseCycle >= 0) {
-      int Latency = DefCycle - UseCycle + 1;
-      if (Latency >= 0)
-        dep.setLatency(Latency);
-    }
+  if (!Def->isMachineOpcode())
+    return;
+
+  const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+  if (DefIdx >= II.getNumDefs())
+    return;
+
+  int Latency = 0;
+  if (!Use->isMachineOpcode()) {
+    Latency = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx);
+  } else {
+    unsigned DefClass = II.getSchedClass();
+    unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
+    Latency = InstrItins->getOperandLatency(DefClass, DefIdx, UseClass, OpIdx);
   }
+
+  if (Latency >= 0)
+    dep.setLatency(Latency);
 }
 
 void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 5ca21aa..25eca70 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -285,7 +285,7 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
-  def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
+  def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsir,
                  opc, "\t$dst, $rhs, $lhs",
                  [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> {
     let Inst{31-27} = 0b11101;
@@ -1698,7 +1698,7 @@ defm t2ORN  : T2I_bin_irs<0b0011, "orn",
 // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
 let AddedComplexity = 1 in
 defm t2MVN  : T2I_un_irs <0b0011, "mvn",
-                          IIC_iMOVi, IIC_iMOVr, IIC_iMOVsi,
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
                           UnOpFrag<(not node:$Src)>, 1, 1>;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 133a81b..00d148b 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -14,6 +14,7 @@ def IIC_iALUx      : InstrItinClass;
 def IIC_iALUi      : InstrItinClass;
 def IIC_iALUr      : InstrItinClass;
 def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
 def IIC_iALUsr     : InstrItinClass;
 def IIC_iBITi      : InstrItinClass;
 def IIC_iBITr      : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index e6b2bea..8962ec9 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -36,6 +36,7 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iALUi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
   InstrItinData<IIC_iALUr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
   InstrItinData<IIC_iALUsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
   InstrItinData<IIC_iALUsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
   //
   // Bitwise Instructions that produce a result
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 14197c8..1f4b8d1 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -23,10 +23,14 @@ def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipe
 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
 def A9_DRegsN  : FuncUnit; // FP register set, NEON side
 
+// Bypasses
+def A9_LdBypass : Bypass;
+
 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
 //
 def CortexA9Itineraries : ProcessorItineraries<
-  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [], [
+  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1],
+  [A9_LdBypass], [
   // Two fully-pipelined integer ALU pipelines
   //
@@ -39,19 +43,30 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
   //
   // MVN instructions
-  InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
-  InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1, 1]>,
+  InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1]>,
   //
   // No operand cycles
   InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
   //
   // Binary Instructions that produce a result
-  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>,
-  InstrItinData<IIC_iALUsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
-  InstrItinData<IIC_iALUsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 2], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1, 2], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1, 1],
+                [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
   //
   // Bitwise Instructions that produce a result
   InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
@@ -69,10 +84,14 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iEXTAsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],[3, 1, 1, 1]>,
   //
   // Compare instructions
-  InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
-  InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
   //
   // Test instructions
   InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
@@ -105,31 +124,38 @@ def CortexA9Itineraries : ProcessorItineraries<
   //
   // Immediate offset
   InstrItinData<IIC_iLoadi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>], [3, 1]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3, 1], [A9_LdBypass]>,
   //
   // Register offset
   InstrItinData<IIC_iLoadr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset
   InstrItinData<IIC_iLoadsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [4, 1, 1], [A9_LdBypass]>,
   //
   // Immediate offset with update
   InstrItinData<IIC_iLoadiu , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [3, 2, 1], [A9_LdBypass]>,
   //
   // Register offset with update
   InstrItinData<IIC_iLoadru , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [3, 2, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset with update
   InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>,
+                                InstrStage<2, [A9_LSPipe]>],
+                [4, 3, 1, 1], [A9_LdBypass]>,
   //
   // Load multiple
   InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3], [A9_LdBypass]>,
   //
   // Load multiple plus branch
@@ -141,7 +167,8 @@ def CortexA9Itineraries : ProcessorItineraries<
   // iLoadi + iALUr for t2LDRpci_pic.
   InstrItinData<IIC_iLoadiALU , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
                                  InstrStage<1, [A9_LSPipe]>,
-                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [4, 1]>,
+                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1]>,
 
   // Integer store pipeline
   //
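For readers not familiar with itinerary bypasses, the mechanism this patch models is: the load itineraries name A9_LdBypass on their result operand, the ALU/MVN/CMP itineraries name the same bypass on the source operands that can accept a forwarded load result, and the getOperandLatency query now called from both schedulers credits the forwarding path when the def-side and use-side names match. The sketch below is a minimal, self-contained illustration of that idea only; the struct, cycle values, and bypass names in it are hypothetical, and it is not LLVM's implementation.

// Toy model of a def-to-use operand latency query with a forwarding (bypass)
// path, in the spirit of the A9_LdBypass entries in the patch above.
// Hypothetical types and numbers; not LLVM's InstrItineraryData.
#include <algorithm>
#include <cstdio>
#include <string>

struct OperandTiming {
  int Cycle;           // cycle the operand is produced (def) or read (use)
  std::string Bypass;  // name of a forwarding path; empty means no bypass
};

// Base latency is DefCycle - UseCycle + 1 (clamped to 1, as in the code this
// patch replaces); a matching named bypass on both ends models a pipeline
// forwarding path and saves one cycle.
static int operandLatency(const OperandTiming &Def, const OperandTiming &Use) {
  int Latency = std::max(Def.Cycle - Use.Cycle + 1, 1);
  if (Latency > 1 && !Def.Bypass.empty() && Def.Bypass == Use.Bypass)
    --Latency;  // result is forwarded directly into the consumer's pipeline
  return Latency;
}

int main() {
  OperandTiming LoadResult  = {3, "LdBypass"}; // load result ready at cycle 3
  OperandTiming AddSrcReg   = {2, "LdBypass"}; // ADD reads register operands at cycle 2
  OperandTiming AddShiftSrc = {2, ""};         // shifter operand: no forwarding path
  std::printf("load -> ADD register operand: %d cycle(s)\n",
              operandLatency(LoadResult, AddSrcReg));   // forwarded: 1
  std::printf("load -> ADD shifter operand:  %d cycle(s)\n",
              operandLatency(LoadResult, AddShiftSrc)); // not forwarded: 2
  return 0;
}

In the A9 table above, this pairing is what the [A9_LdBypass] lists on the load results and the [NoBypass, A9_LdBypass, ...] lists on the ALU and compare source operands express.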