-rw-r--r--   include/llvm/CodeGen/MachineScheduler.h |   9
-rw-r--r--   lib/CodeGen/MachineScheduler.cpp         | 908
-rw-r--r--   test/CodeGen/X86/misched-balance.ll      | 230
3 files changed, 1000 insertions(+), 147 deletions(-)
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 2b96c7a..31bd606 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -154,6 +154,8 @@ public: bool empty() const { return Queue.empty(); } + void clear() { Queue.clear(); } + unsigned size() const { return Queue.size(); } typedef std::vector<SUnit*>::iterator iterator; @@ -171,10 +173,12 @@ public: SU->NodeQueueId |= ID; } - void remove(iterator I) { + iterator remove(iterator I) { (*I)->NodeQueueId &= ~ID; *I = Queue.back(); + unsigned idx = I - Queue.begin(); Queue.pop_back(); + return Queue.begin() + idx; } #ifndef NDEBUG @@ -306,6 +310,9 @@ protected: /// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues. void placeDebugValues(); + /// \brief dump the scheduled Sequence. + void dumpSchedule() const; + // Lesser helpers... void initRegPressure(); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 2438eb1..de16932 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -49,6 +49,15 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, static bool ViewMISchedDAGs = false; #endif // NDEBUG +// Threshold to very roughly model an out-of-order processor's instruction +// buffers. If the actual value of this threshold matters much in practice, then +// it can be specified by the machine model. For now, it's an experimental +// tuning knob to determine when and if it matters. +static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden, + cl::desc("Allow expected latency to exceed the critical path by N cycles " + "before attempting to balance ILP"), + cl::init(10U)); + //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// @@ -487,6 +496,13 @@ void ScheduleDAGMI::schedule() { assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); placeDebugValues(); + + DEBUG({ + unsigned BBNum = top()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); } /// Build the DAG and setup three register pressure trackers. @@ -627,6 +643,17 @@ void ScheduleDAGMI::placeDebugValues() { FirstDbgValue = NULL; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ScheduleDAGMI::dumpSchedule() const { + for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { + if (SUnit *SU = getSUnit(&(*MI))) + SU->dump(this); + else + dbgs() << "Missing SUnit\n"; + } +} +#endif + //===----------------------------------------------------------------------===// // ConvergingScheduler - Implementation of the standard MachineSchedStrategy. //===----------------------------------------------------------------------===// @@ -635,33 +662,127 @@ namespace { /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance /// the schedule. class ConvergingScheduler : public MachineSchedStrategy { +public: + /// Represent the type of SchedCandidate found within a single queue. + /// pickNodeBidirectional depends on these listed by decreasing priority. 
+ enum CandReason { + NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand, + BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, + SingleMax, MultiPressure, NextDefUse, NodeOrder}; + +#ifndef NDEBUG + static const char *getReasonStr(ConvergingScheduler::CandReason Reason); +#endif + + /// Policy for scheduling the next instruction in the candidate's zone. + struct CandPolicy { + bool ReduceLatency; + unsigned ReduceResIdx; + unsigned DemandResIdx; + + CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} + }; + + /// Status of an instruction's critical resource consumption. + struct SchedResourceDelta { + // Count critical resources in the scheduled region required by SU. + unsigned CritResources; + + // Count critical resources from another region consumed by SU. + unsigned DemandedResources; + + SchedResourceDelta(): CritResources(0), DemandedResources(0) {} + + bool operator==(const SchedResourceDelta &RHS) const { + return CritResources == RHS.CritResources + && DemandedResources == RHS.DemandedResources; + } + bool operator!=(const SchedResourceDelta &RHS) const { + return !operator==(RHS); + } + }; /// Store the state used by ConvergingScheduler heuristics, required for the /// lifetime of one invocation of pickNode(). struct SchedCandidate { + CandPolicy Policy; + // The best SUnit candidate. SUnit *SU; + // The reason for this candidate. + CandReason Reason; + // Register pressure values for the best candidate. RegPressureDelta RPDelta; - SchedCandidate(): SU(NULL) {} + // Critical resource consumption of the best candidate. + SchedResourceDelta ResDelta; + + SchedCandidate(const CandPolicy &policy) + : Policy(policy), SU(NULL), Reason(NoCand) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + RPDelta = Best.RPDelta; + ResDelta = Best.ResDelta; + } + + void initResourceDelta(const ScheduleDAGMI *DAG, + const TargetSchedModel *SchedModel); + }; + + /// Summarize the unscheduled region. + struct SchedRemainder { + // Critical path through the DAG in expected latency. + unsigned CriticalPath; + + // Unscheduled resources + SmallVector<unsigned, 16> RemainingCounts; + // Critical resource for the unscheduled zone. + unsigned CritResIdx; + // Number of micro-ops left to schedule. + unsigned RemainingMicroOps; + // Is the unscheduled zone resource limited. + bool IsResourceLimited; + + unsigned MaxRemainingCount; + + void reset() { + CriticalPath = 0; + RemainingCounts.clear(); + CritResIdx = 0; + RemainingMicroOps = 0; + IsResourceLimited = false; + MaxRemainingCount = 0; + } + + SchedRemainder() { reset(); } + + void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel); }; - /// Represent the type of SchedCandidate found within a single queue. - enum CandResult { - NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure }; /// Each Scheduling boundary is associated with ready queues. It tracks the - /// current cycle in whichever direction at has moved, and maintains the state + /// current cycle in the direction of movement, and maintains the state /// of "hazards" and other interlocks at the current cycle. 
struct SchedBoundary { ScheduleDAGMI *DAG; const TargetSchedModel *SchedModel; + SchedRemainder *Rem; ReadyQueue Available; ReadyQueue Pending; bool CheckPending; + // For heuristics, keep a list of the nodes that immediately depend on the + // most recently scheduled node. + SmallPtrSet<const SUnit*, 8> NextSUs; + ScheduleHazardRecognizer *HazardRec; unsigned CurrCycle; @@ -670,34 +791,88 @@ class ConvergingScheduler : public MachineSchedStrategy { /// MinReadyCycle - Cycle of the soonest available instruction. unsigned MinReadyCycle; + // The expected latency of the critical path in this scheduled zone. + unsigned ExpectedLatency; + + // Resources used in the scheduled zone beyond this boundary. + SmallVector<unsigned, 16> ResourceCounts; + + // Cache the critical resources ID in this scheduled zone. + unsigned CritResIdx; + + // Is the scheduled region resource limited vs. latency limited. + bool IsResourceLimited; + + unsigned ExpectedCount; + + // Policy flag: attempt to find ILP until expected latency is covered. + bool ShouldIncreaseILP; + +#ifndef NDEBUG // Remember the greatest min operand latency. unsigned MaxMinLatency; +#endif + + void reset() { + Available.clear(); + Pending.clear(); + CheckPending = false; + NextSUs.clear(); + HazardRec = 0; + CurrCycle = 0; + IssueCount = 0; + MinReadyCycle = UINT_MAX; + ExpectedLatency = 0; + ResourceCounts.resize(1); + assert(!ResourceCounts[0] && "nonzero count for bad resource"); + CritResIdx = 0; + IsResourceLimited = false; + ExpectedCount = 0; + ShouldIncreaseILP = false; +#ifndef NDEBUG + MaxMinLatency = 0; +#endif + // Reserve a zero-count for invalid CritResIdx. + ResourceCounts.resize(1); + } /// Pending queues extend the ready queues with the same ID and the /// PendingFlag set. SchedBoundary(unsigned ID, const Twine &Name): - DAG(0), SchedModel(0), Available(ID, Name+".A"), - Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"), - CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0), - MinReadyCycle(UINT_MAX), MaxMinLatency(0) {} + DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"), + Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") { + reset(); + } ~SchedBoundary() { delete HazardRec; } - void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel) { - DAG = dag; - SchedModel = smodel; - } + void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, + SchedRemainder *rem); bool isTop() const { return Available.getID() == ConvergingScheduler::TopQID; } + unsigned getUnscheduledLatency(SUnit *SU) const { + if (isTop()) + return SU->getHeight(); + return SU->getDepth(); + } + + unsigned getCriticalCount() const { + return ResourceCounts[CritResIdx]; + } + bool checkHazard(SUnit *SU); + void checkILPPolicy(); + void releaseNode(SUnit *SU, unsigned ReadyCycle); void bumpCycle(); + void countResource(unsigned PIdx, unsigned Cycles); + void bumpNode(SUnit *SU); void releasePending(); @@ -707,11 +882,13 @@ class ConvergingScheduler : public MachineSchedStrategy { SUnit *pickOnlyChoice(); }; +private: ScheduleDAGMI *DAG; const TargetSchedModel *SchedModel; const TargetRegisterInfo *TRI; // State of the top and bottom scheduled instruction boundaries. 
+ SchedRemainder Rem; SchedBoundary Top; SchedBoundary Bot; @@ -736,25 +913,75 @@ public: virtual void releaseBottomNode(SUnit *SU); + virtual void registerRoots(); + protected: - SUnit *pickNodeBidrectional(bool &IsTopNode); + void balanceZones( + ConvergingScheduler::SchedBoundary &CriticalZone, + ConvergingScheduler::SchedCandidate &CriticalCand, + ConvergingScheduler::SchedBoundary &OppositeZone, + ConvergingScheduler::SchedCandidate &OppositeCand); + + void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand, + ConvergingScheduler::SchedCandidate &BotCand); + + void tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker); + + SUnit *pickNodeBidirectional(bool &IsTopNode); + + void pickNodeFromQueue(SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate); - CandResult pickNodeFromQueue(ReadyQueue &Q, - const RegPressureTracker &RPTracker, - SchedCandidate &Candidate); #ifndef NDEBUG - void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU, - PressureElement P = PressureElement()); + void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone); #endif }; } // namespace +void ConvergingScheduler::SchedRemainder:: +init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { + reset(); + if (!SchedModel->hasInstrSchedModel()) + return; + RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); + for (std::vector<SUnit>::iterator + I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { + const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); + RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + unsigned PIdx = PI->ProcResourceIdx; + unsigned Factor = SchedModel->getResourceFactor(PIdx); + RemainingCounts[PIdx] += (Factor * PI->Cycles); + } + } +} + +void ConvergingScheduler::SchedBoundary:: +init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { + reset(); + DAG = dag; + SchedModel = smodel; + Rem = rem; + if (SchedModel->hasInstrSchedModel()) + ResourceCounts.resize(SchedModel->getNumProcResourceKinds()); +} + void ConvergingScheduler::initialize(ScheduleDAGMI *dag) { DAG = dag; SchedModel = DAG->getSchedModel(); TRI = DAG->TRI; - Top.init(DAG, SchedModel); - Bot.init(DAG, SchedModel); + Rem.init(DAG, SchedModel); + Top.init(DAG, SchedModel, &Rem); + Bot.init(DAG, SchedModel, &Rem); + + // Initialize resource counts. // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or // are disabled, then these HazardRecs will be disabled. @@ -803,6 +1030,17 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) { Bot.releaseNode(SU, SU->BotReadyCycle); } +void ConvergingScheduler::registerRoots() { + Rem.CriticalPath = DAG->ExitSU.getDepth(); + // Some roots may not feed into ExitSU. Check all of them in case. + for (std::vector<SUnit*>::const_iterator + I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { + if ((*I)->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = (*I)->getDepth(); + } + DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); +} + /// Does this SU have a hazard within the current instruction group. /// /// The scheduler supports two modes of hazard recognition. 
The first is the @@ -821,14 +1059,26 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) { return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); - if (IssueCount + uops > SchedModel->getIssueWidth()) + if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) { + DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" + << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; - + } return false; } +/// If expected latency is covered, disable ILP policy. +void ConvergingScheduler::SchedBoundary::checkILPPolicy() { + if (ShouldIncreaseILP + && (IsResourceLimited || ExpectedLatency <= CurrCycle)) { + ShouldIncreaseILP = false; + DEBUG(dbgs() << "Disable ILP: " << Available.getName() << '\n'); + } +} + void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { + if (ReadyCycle < MinReadyCycle) MinReadyCycle = ReadyCycle; @@ -838,6 +1088,18 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, Pending.push(SU); else Available.push(SU); + + // Record this node as an immediate dependent of the scheduled node. + NextSUs.insert(SU); + + // If CriticalPath has been computed, then check if the unscheduled nodes + // exceed the ILP window. Before registerRoots, CriticalPath==0. + if (Rem->CriticalPath && (ExpectedLatency + getUnscheduledLatency(SU) + > Rem->CriticalPath + ILPWindow)) { + ShouldIncreaseILP = true; + DEBUG(dbgs() << "Increase ILP: " << Available.getName() << " " + << ExpectedLatency << " + " << getUnscheduledLatency(SU) << '\n'); + } } /// Move the boundary of scheduled code by one cycle. @@ -845,8 +1107,12 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { unsigned Width = SchedModel->getIssueWidth(); IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; + unsigned NextCycle = CurrCycle + 1; assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); - unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); + if (MinReadyCycle > NextCycle) { + IssueCount = 0; + NextCycle = MinReadyCycle; + } if (!HazardRec->isEnabled()) { // Bypass HazardRec virtual calls. @@ -862,11 +1128,39 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { } } CheckPending = true; + IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle); - DEBUG(dbgs() << "*** " << Available.getName() << " cycle " + DEBUG(dbgs() << " *** " << Available.getName() << " cycle " << CurrCycle << '\n'); } +/// Add the given processor resource to this scheduled zone. +void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx, + unsigned Cycles) { + unsigned Factor = SchedModel->getResourceFactor(PIdx); + DEBUG(dbgs() << " " << SchedModel->getProcResource(PIdx)->Name + << " +(" << Cycles << "x" << Factor + << ") / " << SchedModel->getLatencyFactor() << '\n'); + + unsigned Count = Factor * Cycles; + ResourceCounts[PIdx] += Count; + assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted"); + Rem->RemainingCounts[PIdx] -= Count; + + // Reset MaxRemainingCount for sanity. + Rem->MaxRemainingCount = 0; + + // Check if this resource exceeds the current critical resource by a full + // cycle. If so, it becomes the critical resource. 
+ if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx]) + >= (int)SchedModel->getLatencyFactor()) { + CritResIdx = PIdx; + DEBUG(dbgs() << " *** Critical resource " + << SchedModel->getProcResource(PIdx)->Name << " x" + << ResourceCounts[PIdx] << '\n'); + } +} + /// Move the boundary of scheduled code by one SUnit. void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { // Update the reservation table. @@ -878,11 +1172,38 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) { } HazardRec->EmitInstruction(SU); } + // Update resource counts and critical resource. + if (SchedModel->hasInstrSchedModel()) { + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + countResource(PI->ProcResourceIdx, PI->Cycles); + } + } + if (isTop()) { + if (SU->getDepth() > ExpectedLatency) + ExpectedLatency = SU->getDepth(); + } + else { + if (SU->getHeight() > ExpectedLatency) + ExpectedLatency = SU->getHeight(); + } + + IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle); + // Check the instruction group dispatch limit. // TODO: Check if this SU must end a dispatch group. IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); + + // checkHazard prevents scheduling multiple instructions per cycle that exceed + // issue width. However, we commonly reach the maximum. In this case + // opportunistically bump the cycle to avoid uselessly checking everything in + // the readyQ. Furthermore, a single instruction may produce more than one + // cycle's worth of micro-ops. if (IssueCount >= SchedModel->getIssueWidth()) { - DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + DEBUG(dbgs() << " *** Max instrs at cycle " << CurrCycle << '\n'); bumpCycle(); } } @@ -913,6 +1234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() { Pending.remove(Pending.begin()+i); --i; --e; } + DEBUG(if (!Pending.empty()) Pending.dump()); CheckPending = false; } @@ -927,12 +1249,23 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) { } /// If this queue only has one ready candidate, return it. As a side effect, -/// advance the cycle until at least one node is ready. If multiple instructions -/// are ready, return NULL. +/// defer any nodes that now hit a hazard, and advance the cycle until at least +/// one node is ready. If multiple instructions are ready, return NULL. SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { if (CheckPending) releasePending(); + if (IssueCount > 0) { + // Defer any ready instrs that now have a hazard. 
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { + if (checkHazard(*I)) { + Pending.push(*I); + I = Available.remove(I); + continue; + } + ++I; + } + } for (unsigned i = 0; Available.empty(); ++i) { assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && "permanent hazard"); (void)i; @@ -944,18 +1277,262 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { return NULL; } -#ifndef NDEBUG -void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q, - SUnit *SU, PressureElement P) { - dbgs() << Label << " " << Q.getName() << " "; - if (P.isValid()) - dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease - << " "; - else - dbgs() << " "; - SU->dump(DAG); +/// Record the candidate policy for opposite zones with different critical +/// resources. +/// +/// If the CriticalZone is latency limited, don't force a policy for the +/// candidates here. Instead, When releasing each candidate, releaseNode +/// compares the region's critical path to the candidate's height or depth and +/// the scheduled zone's expected latency then sets ShouldIncreaseILP. +void ConvergingScheduler::balanceZones( + ConvergingScheduler::SchedBoundary &CriticalZone, + ConvergingScheduler::SchedCandidate &CriticalCand, + ConvergingScheduler::SchedBoundary &OppositeZone, + ConvergingScheduler::SchedCandidate &OppositeCand) { + + if (!CriticalZone.IsResourceLimited) + return; + + SchedRemainder *Rem = CriticalZone.Rem; + + // If the critical zone is overconsuming a resource relative to the + // remainder, try to reduce it. + unsigned RemainingCritCount = + Rem->RemainingCounts[CriticalZone.CritResIdx]; + if ((int)(Rem->MaxRemainingCount - RemainingCritCount) + > (int)SchedModel->getLatencyFactor()) { + CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx; + DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce " + << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name + << '\n'); + } + // If the other zone is underconsuming a resource relative to the full zone, + // try to increase it. + unsigned OppositeCount = + OppositeZone.ResourceCounts[CriticalZone.CritResIdx]; + if ((int)(OppositeZone.ExpectedCount - OppositeCount) + > (int)SchedModel->getLatencyFactor()) { + OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx; + DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand " + << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name + << '\n'); + } +} + +/// Determine if the scheduled zones exceed resource limits or critical path and +/// set each candidate's ReduceHeight policy accordingly. +void ConvergingScheduler::checkResourceLimits( + ConvergingScheduler::SchedCandidate &TopCand, + ConvergingScheduler::SchedCandidate &BotCand) { + + Bot.checkILPPolicy(); + Top.checkILPPolicy(); + if (Bot.ShouldIncreaseILP) + BotCand.Policy.ReduceLatency = true; + if (Top.ShouldIncreaseILP) + TopCand.Policy.ReduceLatency = true; + + // Handle resource-limited regions. + if (Top.IsResourceLimited && Bot.IsResourceLimited + && Top.CritResIdx == Bot.CritResIdx) { + // If the scheduled critical resource in both zones is no longer the + // critical remaining resource, attempt to reduce resource height both ways. 
+ if (Top.CritResIdx != Rem.CritResIdx) { + TopCand.Policy.ReduceResIdx = Top.CritResIdx; + BotCand.Policy.ReduceResIdx = Bot.CritResIdx; + DEBUG(dbgs() << "Reduce scheduled " + << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n'); + } + return; + } + // Handle latency-limited regions. + if (!Top.IsResourceLimited && !Bot.IsResourceLimited) { + // If the total scheduled expected latency exceeds the region's critical + // path then reduce latency both ways. + // + // Just because a zone is not resource limited does not mean it is latency + // limited. Unbuffered resource, such as max micro-ops may cause CurrCycle + // to exceed expected latency. + if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath) + && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) { + TopCand.Policy.ReduceLatency = true; + BotCand.Policy.ReduceLatency = true; + DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency + << " + " << Bot.ExpectedLatency << '\n'); + } + return; + } + // The critical resource is different in each zone, so request balancing. + + // Compute the cost of each zone. + Rem.MaxRemainingCount = std::max( + Rem.RemainingMicroOps * SchedModel->getMicroOpFactor(), + Rem.RemainingCounts[Rem.CritResIdx]); + Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle); + Top.ExpectedCount = std::max( + Top.getCriticalCount(), + Top.ExpectedCount * SchedModel->getLatencyFactor()); + Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle); + Bot.ExpectedCount = std::max( + Bot.getCriticalCount(), + Bot.ExpectedCount * SchedModel->getLatencyFactor()); + + balanceZones(Top, TopCand, Bot, BotCand); + balanceZones(Bot, BotCand, Top, TopCand); +} + +void ConvergingScheduler::SchedCandidate:: +initResourceDelta(const ScheduleDAGMI *DAG, + const TargetSchedModel *SchedModel) { + if (!Policy.ReduceResIdx && !Policy.DemandResIdx) + return; + + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + for (TargetSchedModel::ProcResIter + PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + if (PI->ProcResourceIdx == Policy.ReduceResIdx) + ResDelta.CritResources += PI->Cycles; + if (PI->ProcResourceIdx == Policy.DemandResIdx) + ResDelta.DemandedResources += PI->Cycles; + } +} + +/// Return true if this heuristic determines order. +static bool tryLess(unsigned TryVal, unsigned CandVal, + ConvergingScheduler::SchedCandidate &TryCand, + ConvergingScheduler::SchedCandidate &Cand, + ConvergingScheduler::CandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + return false; +} +static bool tryGreater(unsigned TryVal, unsigned CandVal, + ConvergingScheduler::SchedCandidate &TryCand, + ConvergingScheduler::SchedCandidate &Cand, + ConvergingScheduler::CandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + return false; +} + +/// Apply a set of heursitics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. 
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. +/// \param Zone describes the scheduled zone that we are extending. +/// \param RPTracker describes reg pressure within the scheduled zone. +/// \param TempTracker is a scratch pressure tracker to reuse in queries. +void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker) { + + // Always initialize TryCand's RPDelta. + TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + // Avoid exceeding the target's limit. + if (tryLess(TryCand.RPDelta.Excess.UnitIncrease, + Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess)) + return; + if (Cand.Reason == SingleExcess) + Cand.Reason = MultiPressure; + + // Avoid increasing the max critical pressure in the scheduled region. + if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease, + Cand.RPDelta.CriticalMax.UnitIncrease, + TryCand, Cand, SingleCritical)) + return; + if (Cand.Reason == SingleCritical) + Cand.Reason = MultiPressure; + + // Avoid critical resource consumption and balance the schedule. + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, + TryCand, Cand, ResourceDemand)) + return; + + // Avoid serializing long latency dependence chains. + if (Cand.Policy.ReduceLatency) { + if (Zone.isTop()) { + if (Cand.SU->getDepth() * SchedModel->getLatencyFactor() + > Zone.ExpectedCount) { + if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), + TryCand, Cand, TopDepthReduce)) + return; + } + if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), + TryCand, Cand, TopPathReduce)) + return; + } + else { + if (Cand.SU->getHeight() * SchedModel->getLatencyFactor() + > Zone.ExpectedCount) { + if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), + TryCand, Cand, BotHeightReduce)) + return; + } + if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), + TryCand, Cand, BotPathReduce)) + return; + } + } + + // Avoid increasing the max pressure of the entire region. + if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease, + Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax)) + return; + if (Cand.Reason == SingleMax) + Cand.Reason = MultiPressure; + + // Prefer immediate defs/users of the last scheduled instruction. This is a + // nice pressure avoidance strategy that also conserves the processor's + // register renaming resources and keeps the machine code readable. + if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) { + TryCand.Reason = NextDefUse; + return; + } + if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) { + if (Cand.Reason > NextDefUse) + Cand.Reason = NextDefUse; + return; + } + // Fall through to original instruction order. + if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) + || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { + TryCand.Reason = NodeOrder; + } } -#endif /// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is /// more desirable than RHS from scheduling standpoint. 
@@ -966,109 +1543,143 @@ static bool compareRPDelta(const RegPressureDelta &LHS, // have UnitIncrease==0, so are neutral. // Avoid increasing the max critical pressure in the scheduled region. - if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) + if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) { + DEBUG(dbgs() << "RP excess top - bot: " + << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n'); return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease; - + } // Avoid increasing the max critical pressure in the scheduled region. - if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) + if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) { + DEBUG(dbgs() << "RP critical top - bot: " + << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease) + << '\n'); return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease; - + } // Avoid increasing the max pressure of the entire region. - if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) + if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) { + DEBUG(dbgs() << "RP current top - bot: " + << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease) + << '\n'); return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease; - + } return false; } +#ifndef NDEBUG +const char *ConvergingScheduler::getReasonStr( + ConvergingScheduler::CandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND "; + case SingleExcess: return "REG-EXCESS"; + case SingleCritical: return "REG-CRIT "; + case SingleMax: return "REG-MAX "; + case MultiPressure: return "REG-MULTI "; + case ResourceReduce: return "RES-REDUCE"; + case ResourceDemand: return "RES-DEMAND"; + case TopDepthReduce: return "TOP-DEPTH "; + case TopPathReduce: return "TOP-PATH "; + case BotHeightReduce:return "BOT-HEIGHT"; + case BotPathReduce: return "BOT-PATH "; + case NextDefUse: return "DEF-USE "; + case NodeOrder: return "ORDER "; + }; +} + +void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand, + const SchedBoundary &Zone) { + const char *Label = getReasonStr(Cand.Reason); + PressureElement P; + unsigned ResIdx = 0; + unsigned Latency = 0; + switch (Cand.Reason) { + default: + break; + case SingleExcess: + P = Cand.RPDelta.Excess; + break; + case SingleCritical: + P = Cand.RPDelta.CriticalMax; + break; + case SingleMax: + P = Cand.RPDelta.CurrentMax; + break; + case ResourceReduce: + ResIdx = Cand.Policy.ReduceResIdx; + break; + case ResourceDemand: + ResIdx = Cand.Policy.DemandResIdx; + break; + case TopDepthReduce: + Latency = Cand.SU->getDepth(); + break; + case TopPathReduce: + Latency = Cand.SU->getHeight(); + break; + case BotHeightReduce: + Latency = Cand.SU->getHeight(); + break; + case BotPathReduce: + Latency = Cand.SU->getDepth(); + break; + } + dbgs() << Label << " " << Zone.Available.getName() << " "; + if (P.isValid()) + dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease + << " "; + else + dbgs() << " "; + if (ResIdx) + dbgs() << SchedModel->getProcResource(ResIdx)->Name << " "; + else + dbgs() << " "; + if (Latency) + dbgs() << Latency << " cycles "; + else + dbgs() << " "; + Cand.SU->dump(DAG); +} +#endif + /// Pick the best candidate from the top queue. /// /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during /// DAG building. To adjust for the current scheduling location we need to /// maintain the number of vreg uses remaining to be top-scheduled. 
-ConvergingScheduler::CandResult ConvergingScheduler:: -pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker, - SchedCandidate &Candidate) { +void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Cand) { + ReadyQueue &Q = Zone.Available; + DEBUG(Q.dump()); // getMaxPressureDelta temporarily modifies the tracker. RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - // BestSU remains NULL if no top candidates beat the best existing candidate. - CandResult FoundCandidate = NoCand; for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - - // Initialize the candidate if needed. - if (!Candidate.SU) { - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = NodeOrder; - continue; - } - // Avoid exceeding the target's limit. - if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) { - DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleExcess; - continue; - } - if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease) - continue; - if (FoundCandidate == SingleExcess) - FoundCandidate = MultiPressure; - - // Avoid increasing the max critical pressure in the scheduled region. - if (RPDelta.CriticalMax.UnitIncrease - < Candidate.RPDelta.CriticalMax.UnitIncrease) { - DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleCritical; - continue; - } - if (RPDelta.CriticalMax.UnitIncrease - > Candidate.RPDelta.CriticalMax.UnitIncrease) - continue; - if (FoundCandidate == SingleCritical) - FoundCandidate = MultiPressure; - - // Avoid increasing the max pressure of the entire region. - if (RPDelta.CurrentMax.UnitIncrease - < Candidate.RPDelta.CurrentMax.UnitIncrease) { - DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = SingleMax; - continue; - } - if (RPDelta.CurrentMax.UnitIncrease - > Candidate.RPDelta.CurrentMax.UnitIncrease) - continue; - if (FoundCandidate == SingleMax) - FoundCandidate = MultiPressure; - - // Fall through to original instruction order. - // Only consider node order if Candidate was chosen from this Q. - if (FoundCandidate == NoCand) - continue; - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - DEBUG(traceCandidate("NCAND", Q, *I)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - FoundCandidate = NodeOrder; + SchedCandidate TryCand(Cand.Policy); + TryCand.SU = *I; + tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it. + if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(DAG, SchedModel); + Cand.setBest(TryCand); + DEBUG(traceCandidate(Cand, Zone)); } + TryCand.SU = *I; } - return FoundCandidate; +} + +static void tracePick(const ConvergingScheduler::SchedCandidate &Cand, + bool IsTop) { + DEBUG(dbgs() << "Pick " << (IsTop ? 
"top" : "bot") + << " SU(" << Cand.SU->NodeNum << ") " + << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n'); } /// Pick the best candidate node from either the top or bottom queue. -SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { +SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) { // Schedule as far as possible in the direction of no choice. This is most // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { @@ -1079,11 +1690,14 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { IsTopNode = true; return SU; } - SchedCandidate BotCand; + CandPolicy NoPolicy; + SchedCandidate BotCand(NoPolicy); + SchedCandidate TopCand(NoPolicy); + checkResourceLimits(TopCand, BotCand); + // Prefer bottom scheduling when heuristics are silent. - CandResult BotResult = pickNodeFromQueue(Bot.Available, - DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); // If either Q has a single candidate that provides the least increase in // Excess pressure, we can immediately schedule from that Q. @@ -1092,37 +1706,41 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { // affects picking from either Q. If scheduling in one direction must // increase pressure for one of the excess PSets, then schedule in that // direction first to provide more freedom in the other direction. - if (BotResult == SingleExcess || BotResult == SingleCritical) { + if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) { IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } // Check if the top Q has a better candidate. - SchedCandidate TopCand; - CandResult TopResult = pickNodeFromQueue(Top.Available, - DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); - if (TopResult == SingleExcess || TopResult == SingleCritical) { - IsTopNode = true; - return TopCand.SU; - } // If either Q has a single candidate that minimizes pressure above the // original region's pressure pick it. - if (BotResult == SingleMax) { + if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) { + if (TopCand.Reason < BotCand.Reason) { + IsTopNode = true; + tracePick(TopCand, IsTopNode); + return TopCand.SU; + } IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } - if (TopResult == SingleMax) { + // Check for a salient pressure difference and pick the best from either side. + if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) { IsTopNode = true; + tracePick(TopCand, IsTopNode); return TopCand.SU; } - // Check for a salient pressure difference and pick the best from either side. - if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) { + // Otherwise prefer the bottom candidate, in node order if all else failed. + if (TopCand.Reason < BotCand.Reason) { IsTopNode = true; + tracePick(TopCand, IsTopNode); return TopCand.SU; } - // Otherwise prefer the bottom candidate in node order. 
IsTopNode = false; + tracePick(BotCand, IsTopNode); return BotCand.SU; } @@ -1138,11 +1756,10 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { if (ForceTopDown) { SU = Top.pickOnlyChoice(); if (!SU) { - SchedCandidate TopCand; - CandResult TopResult = - pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - (void)TopResult; + CandPolicy NoPolicy; + SchedCandidate TopCand(NoPolicy); + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); SU = TopCand.SU; } IsTopNode = true; @@ -1150,17 +1767,16 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { else if (ForceBottomUp) { SU = Bot.pickOnlyChoice(); if (!SU) { - SchedCandidate BotCand; - CandResult BotResult = - pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - (void)BotResult; + CandPolicy NoPolicy; + SchedCandidate BotCand(NoPolicy); + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidrectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode); } } while (SU->isScheduled); diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll new file mode 100644 index 0000000..2184d9e --- /dev/null +++ b/test/CodeGen/X86/misched-balance.ll @@ -0,0 +1,230 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -verify-machineinstrs | FileCheck %s +; +; Verify that misched resource/latency balancy heuristics are sane. + +define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body + +; imull folded loads should be in order and interleaved with addl, never +; adjacent. Also check that we have no spilling. +; +; Since mmult1 IR is already in good order, this effectively ensure +; the scheduler maintains source order. 
+; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, 
align 4 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} + +; Unlike the above loop, this IR starts out bad and must be +; rescheduled. +; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + 
%arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} |
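Note on the core heuristic: the patch repeatedly classifies each scheduled zone as resource-limited or latency-limited (the IsResourceLimited update in SchedBoundary::bumpCycle and bumpNode above), and the balance/ILP policies key off that bit. Below is a minimal standalone sketch of that comparison, not part of the patch itself; the three input values are hypothetical stand-ins for quantities the scheduler derives from the DAG and the TargetSchedModel.

// Standalone sketch (not LLVM code): illustrates the zone classification
// used by ConvergingScheduler::SchedBoundary in this patch. Input values
// are hypothetical.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned ExpectedLatency = 14; // longest dependence chain scheduled in the zone (cycles)
  unsigned CurrCycle       = 10; // cycles already consumed by the zone
  unsigned CriticalCount   = 18; // scaled count of the most heavily used resource

  // Mirrors the patch's check:
  //   IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
  bool IsResourceLimited = CriticalCount > std::max(ExpectedLatency, CurrCycle);

  std::printf("zone is %s-limited\n", IsResourceLimited ? "resource" : "latency");
  return 0;
}

When the zone is resource-limited, checkResourceLimits/balanceZones steer candidates toward reducing or demanding the critical resource; otherwise the latency-reduction policy (ReduceLatency, bounded by the ilp-window option) takes over.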