From 3881cb7a5d54c0011b40997adcd742e1c7b91abd Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Wed, 29 Sep 2010 22:42:35 +0000
Subject: Model Cortex-a9 load to SUB, RSB, ADD, ADC, SBC, RSC, CMN, MVN, or CMP pipeline forwarding path.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115098 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/ScheduleDAGInstrs.cpp               | 37 ++++++-------
 lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 34 ++++++------
 lib/Target/ARM/ARMInstrThumb2.td                |  4 +-
 lib/Target/ARM/ARMSchedule.td                   |  1 +
 lib/Target/ARM/ARMScheduleA8.td                 |  1 +
 lib/Target/ARM/ARMScheduleA9.td                 | 69 +++++++++++++++++--------
 6 files changed, 86 insertions(+), 60 deletions(-)

(limited to 'lib')

diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index da0b056..3d2565d 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -527,26 +527,23 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
   MachineInstr *DefMI = Def->getInstr();
   int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
   if (DefIdx != -1) {
-    int DefCycle = InstrItins->getOperandCycle(DefMI->getDesc().getSchedClass(),
-                                               DefIdx);
-    if (DefCycle >= 0) {
-      MachineInstr *UseMI = Use->getInstr();
-      const unsigned UseClass = UseMI->getDesc().getSchedClass();
-
-      // For all uses of the register, calculate the maxmimum latency
-      int Latency = -1;
-      for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &MO = UseMI->getOperand(i);
-        if (!MO.isReg() || !MO.isUse())
-          continue;
-        unsigned MOReg = MO.getReg();
-        if (MOReg != Reg)
-          continue;
-
-        int UseCycle = InstrItins->getOperandCycle(UseClass, i);
-        if (UseCycle >= 0)
-          Latency = std::max(Latency, DefCycle - UseCycle + 1);
-      }
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    MachineInstr *UseMI = Use->getInstr();
+    unsigned UseClass = UseMI->getDesc().getSchedClass();
+
+    // For all uses of the register, calculate the maxmimum latency
+    int Latency = -1;
+    for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = UseMI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (MOReg != Reg)
+        continue;
+
+      int UseCycle = InstrItins->getOperandLatency(DefClass, DefIdx,
+                                                   UseClass, i);
+      Latency = std::max(Latency, UseCycle);
 
       // If we found a latency, then replace the existing dependence latency.
       if (Latency >= 0)
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index fbf621d..23ff9c5 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -457,24 +457,24 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
     return;
 
   unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
-  if (Def->isMachineOpcode()) {
-    const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
-    if (DefIdx >= II.getNumDefs())
-      return;
-    int DefCycle = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx);
-    if (DefCycle < 0)
-      return;
-    int UseCycle = 1;
-    if (Use->isMachineOpcode()) {
-      const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
-      UseCycle = InstrItins->getOperandCycle(UseClass, OpIdx);
-    }
-    if (UseCycle >= 0) {
-      int Latency = DefCycle - UseCycle + 1;
-      if (Latency >= 0)
-        dep.setLatency(Latency);
-    }
+  if (!Def->isMachineOpcode())
+    return;
+
+  const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+  if (DefIdx >= II.getNumDefs())
+    return;
+
+  int Latency = 0;
+  if (!Use->isMachineOpcode()) {
+    Latency = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx);
+  } else {
+    unsigned DefClass = II.getSchedClass();
+    unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
+    Latency = InstrItins->getOperandLatency(DefClass, DefIdx, UseClass, OpIdx);
   }
+
+  if (Latency >= 0)
+    dep.setLatency(Latency);
 }
 
 void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 5ca21aa..25eca70 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -285,7 +285,7 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
-  def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
+  def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsir,
                  opc, "\t$dst, $rhs, $lhs",
                  [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> {
     let Inst{31-27} = 0b11101;
@@ -1698,7 +1698,7 @@ defm t2ORN  : T2I_bin_irs<0b0011, "orn",
 // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
 let AddedComplexity = 1 in
 defm t2MVN  : T2I_un_irs <0b0011, "mvn",
-                          IIC_iMOVi, IIC_iMOVr, IIC_iMOVsi,
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
                           UnOpFrag<(not node:$Src)>, 1, 1>;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 133a81b..00d148b 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -14,6 +14,7 @@ def IIC_iALUx      : InstrItinClass;
 def IIC_iALUi      : InstrItinClass;
 def IIC_iALUr      : InstrItinClass;
 def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
 def IIC_iALUsr     : InstrItinClass;
 def IIC_iBITi      : InstrItinClass;
 def IIC_iBITr      : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index e6b2bea..8962ec9 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -36,6 +36,7 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iALUi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
   InstrItinData<IIC_iALUr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
   InstrItinData<IIC_iALUsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
   InstrItinData<IIC_iALUsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
   //
   // Bitwise Instructions that produce a result
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 14197c8..1f4b8d1 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -23,10 +23,14 @@ def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipe
 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
 def A9_DRegsN  : FuncUnit; // FP register set, NEON side
 
+// Bypasses
+def A9_LdBypass : Bypass;
+
 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
 //
 def CortexA9Itineraries : ProcessorItineraries<
-  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [], [
+  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1],
+  [A9_LdBypass], [
   // Two fully-pipelined integer ALU pipelines
   //
@@ -39,19 +43,30 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
   //
   // MVN instructions
-  InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
-  InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [1, 1]>,
+  InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1]>,
   //
   // No operand cycles
   InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
   //
   // Binary Instructions that produce a result
-  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>,
-  InstrItinData<IIC_iALUsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
-  InstrItinData<IIC_iALUsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 2], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1, 2], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2, 1, 1],
+                [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
   //
   // Bitwise Instructions that produce a result
   InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
@@ -69,10 +84,14 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iEXTAsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],[3, 1, 1, 1]>,
   //
   // Compare instructions
-  InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
-  InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 2], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
   //
   // Test instructions
   InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
@@ -105,31 +124,38 @@ def CortexA9Itineraries : ProcessorItineraries<
   //
   // Immediate offset
   InstrItinData<IIC_iLoadi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>], [3, 1]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3, 1], [A9_LdBypass]>,
   //
   // Register offset
   InstrItinData<IIC_iLoadr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset
   InstrItinData<IIC_iLoadsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [4, 1, 1], [A9_LdBypass]>,
   //
   // Immediate offset with update
   InstrItinData<IIC_iLoadiu , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [3, 2, 1], [A9_LdBypass]>,
   //
   // Register offset with update
   InstrItinData<IIC_iLoadru , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                               InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>,
+                               InstrStage<2, [A9_LSPipe]>],
+                [3, 2, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset with update
   InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>,
+                                InstrStage<2, [A9_LSPipe]>],
+                [4, 3, 1, 1], [A9_LdBypass]>,
   //
   // Load multiple
   InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
-                              InstrStage<1, [A9_LSPipe]>]>,
+                              InstrStage<1, [A9_LSPipe]>],
+                [3], [A9_LdBypass]>,
   //
   // Load multiple plus branch
@@ -141,7 +167,8 @@ def CortexA9Itineraries : ProcessorItineraries<
   // iLoadi + iALUr for t2LDRpci_pic.
   InstrItinData<IIC_iLoadiALU , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
                                  InstrStage<1, [A9_LSPipe]>,
-                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [4, 1]>,
+                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
+                [2, 1]>,
 
   // Integer store pipeline
   //
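For readers not familiar with itinerary bypasses, the mechanism this patch models is: the load itineraries name A9_LdBypass on their result operand, the ALU/MVN/CMP itineraries name the same bypass on the source operands that can accept a forwarded load result, and the getOperandLatency query now called from both schedulers credits the forwarding path when the def-side and use-side names match. The sketch below is a minimal, self-contained illustration of that idea only; the struct, cycle values, and bypass names in it are hypothetical, and it is not LLVM's implementation.

// Toy model of a def-to-use operand latency query with a forwarding (bypass)
// path, in the spirit of the A9_LdBypass entries in the patch above.
// Hypothetical types and numbers; not LLVM's InstrItineraryData.
#include <algorithm>
#include <cstdio>
#include <string>

struct OperandTiming {
  int Cycle;           // cycle the operand is produced (def) or read (use)
  std::string Bypass;  // name of a forwarding path; empty means no bypass
};

// Base latency is DefCycle - UseCycle + 1 (clamped to 1, as in the code this
// patch replaces); a matching named bypass on both ends models a pipeline
// forwarding path and saves one cycle.
static int operandLatency(const OperandTiming &Def, const OperandTiming &Use) {
  int Latency = std::max(Def.Cycle - Use.Cycle + 1, 1);
  if (Latency > 1 && !Def.Bypass.empty() && Def.Bypass == Use.Bypass)
    --Latency;  // result is forwarded directly into the consumer's pipeline
  return Latency;
}

int main() {
  OperandTiming LoadResult  = {3, "LdBypass"}; // load result ready at cycle 3
  OperandTiming AddSrcReg   = {2, "LdBypass"}; // ADD reads register operands at cycle 2
  OperandTiming AddShiftSrc = {2, ""};         // shifter operand: no forwarding path
  std::printf("load -> ADD register operand: %d cycle(s)\n",
              operandLatency(LoadResult, AddSrcReg));   // forwarded: 1
  std::printf("load -> ADD shifter operand:  %d cycle(s)\n",
              operandLatency(LoadResult, AddShiftSrc)); // not forwarded: 2
  return 0;
}

In the A9 table above, this pairing is what the [A9_LdBypass] lists on the load results and the [NoBypass, A9_LdBypass, ...] lists on the ALU and compare source operands express.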